Using OpenAI to Translate Markdown Files with High Quality

An earlier tool converted SCI literature PDFs into Markdown format. This tool extends that toolchain: once a paper has been converted, it can be translated into Chinese immediately.

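First, install the dependency (the code below uses the `openai.chat.completions` interface, so version 1.0 or later of the `openai` package is required):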

pip install openai

The business code is as follows:

import openai
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configure the root logger: INFO level, each record shown as "time - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class MarkdownTranslator:
    """Translate a Markdown file line by line through the OpenAI chat API.

    Every non-blank line outside fenced code blocks is sent as its own chat
    request; blank lines become empty lines and fenced code blocks are
    copied through untouched so that code samples are not rewritten.
    """

    # Chat model used for every translation request.
    MODEL = "gpt-4o-mini"
    # Maximum number of translation requests running at the same time.
    MAX_WORKERS = 10

    def __init__(self, config_file: str) -> None:
        """Load the configuration and apply it to the ``openai`` module.

        Note that the settings are assigned to module-level attributes of
        ``openai``, so they are shared by everything in the process that
        uses the module-level client.

        Args:
            config_file: Path to a JSON file; the keys ``OPENAI_API_KEY`` and
                ``OPENAI_API_BASE`` are read from it (missing keys yield
                ``None``).

        Raises:
            Exception: Re-raised from ``load_config`` when the file cannot be
                opened or parsed.
        """
        self.config = self.load_config(config_file)
        openai.api_key = self.config.get('OPENAI_API_KEY')
        openai.base_url = self.config.get('OPENAI_API_BASE')
        # NOTE(review): looks like a placeholder header carried over from
        # sample code -- confirm whether the target server needs it.
        openai.default_headers = {"x-foo": "true"}

    # Get OpenAI API key and custom server address from the config file
    def load_config(self, config_file: str) -> dict:
        """Return the parsed JSON content of ``config_file``.

        Raises:
            Exception: Any error from opening or parsing the file is logged
                and re-raised unchanged.
        """
        try:
            with open(config_file, 'r', encoding='utf-8') as file:
                return json.load(file)
        except Exception as e:
            logging.error(f"Error reading config file {config_file}: {e}")
            raise

    # Read Markdown file
    def read_markdown(self, file_path: str) -> str:
        """Return the full text of ``file_path``, decoded as UTF-8.

        Raises:
            Exception: Any error from opening or reading the file is logged
                and re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logging.error(f"Error reading file {file_path}: {e}")
            raise

    # Write translated content to a new Markdown file
    def write_markdown(self, file_path: str, content: str) -> None:
        """Write ``content`` to ``file_path`` as UTF-8, replacing any existing file.

        Raises:
            Exception: Any error from opening or writing the file is logged
                and re-raised unchanged.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content)
        except Exception as e:
            logging.error(f"Error writing file {file_path}: {e}")
            raise

    # Translation function
    def translate_text(self, text: str, source_lang: str = 'en', target_lang: str = 'zh') -> str:
        """Translate ``text`` with a single chat-completion request.

        Args:
            text: Text to translate.
            source_lang: Language name/code inserted into the prompt as the source.
            target_lang: Language name/code inserted into the prompt as the target.

        Returns:
            The model's reply with surrounding whitespace stripped. If the
            request (or reading the reply) fails for any reason, the error is
            logged and ``text`` is returned unchanged, so one failed line
            does not abort the whole document.
        """
        try:
            response = openai.chat.completions.create(
                model=self.MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"Please translate the following {source_lang} text into {target_lang}:\n{text}"
                    }
                ]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            logging.error(f"Error translating text: {e}")
            return text  # Return original text to prevent translation failure

    @staticmethod
    def _mask_secret(secret) -> str:
        """Return a log-safe rendering of ``secret`` that never shows it in full."""
        if not secret:
            return '<not set>'
        secret = str(secret)
        if len(secret) <= 8:
            # Too short to reveal any part of it safely.
            return '*' * len(secret)
        return f"{secret[:3]}...{secret[-4:]}"

    @staticmethod
    def _code_fence_flags(lines) -> list:
        """Return one bool per line: True when the line belongs to a fenced code block.

        A fence opens on a line that (ignoring indentation) starts with at
        least three backticks or tildes and closes on a later line made up
        only of the same character, at least as many times. The opening and
        closing fence lines themselves are flagged as code. An unclosed fence
        runs to the end of the document.
        """
        flags = []
        fence_char = ''
        fence_len = 0
        for line in lines:
            stripped = line.strip()
            marker = stripped[:1]
            if marker in ('`', '~'):
                run = len(stripped) - len(stripped.lstrip(marker))
            else:
                run = 0

            if not fence_char:
                # A backtick fence may not contain further backticks after the
                # run; otherwise the line is inline code such as ```x```.
                opens = run >= 3 and (marker == '~' or '`' not in stripped[run:])
                if opens:
                    fence_char, fence_len = marker, run
                flags.append(opens)
            else:
                flags.append(True)
                if marker == fence_char and run >= fence_len and run == len(stripped):
                    fence_char = ''
        return flags

    # Process Markdown content
    def process_markdown_content(self, content: str, source_lang: str, target_lang: str) -> str:
        """Translate ``content`` line by line and return the reassembled text.

        Lines are translated concurrently (up to ``MAX_WORKERS`` at a time)
        and joined back in their original order.

        * Whitespace-only lines become empty lines.
        * Lines inside fenced code blocks are copied verbatim.
        * For every other line only the stripped text is sent to
          ``translate_text``; the line's original leading and trailing
          whitespace is put back around the result so list nesting and
          trailing-space line breaks survive.

        Args:
            content: Markdown text with ``\\n`` line separators.
            source_lang: Passed through to ``translate_text``.
            target_lang: Passed through to ``translate_text``.

        Returns:
            The translated text, with the same number of ``\\n`` separators
            joining the per-line results as in ``content``.
        """
        lines = content.split('\n')
        code_flags = self._code_fence_flags(lines)

        def translate_line(line, is_code):
            if is_code:
                return line
            core = line.strip()
            if not core:
                return ''  # Keep empty lines
            leading = line[:len(line) - len(line.lstrip())]
            trailing = line[len(line.rstrip()):]
            return leading + self.translate_text(core, source_lang, target_lang) + trailing

        # executor.map yields results in submission order, so no shared list
        # or re-sorting is needed; worker exceptions surface here.
        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            translated_lines = list(executor.map(translate_line, lines, code_flags))

        return '\n'.join(translated_lines)

    # Translate file
    def translate_file(self, input_file: str, output_file: str,
                       source_lang: str = 'en', target_lang: str = 'zh') -> None:
        """Read ``input_file``, translate it and write the result to ``output_file``.

        Args:
            input_file: Path of the Markdown file to translate.
            output_file: Path the translated Markdown is written to.
            source_lang: Source language passed to the translation prompt.
            target_lang: Target language passed to the translation prompt.

        Raises:
            Exception: Re-raised from ``read_markdown`` / ``write_markdown``
                on file errors.
        """
        # Print parameters (the API key is masked so it never lands in logs)
        logging.info(f"Translating file from {source_lang} to {target_lang}...")
        logging.info(f"OpenAi_key: {self._mask_secret(openai.api_key)}")
        logging.info(f"OpenAi_base: {openai.base_url}")

        # Read original Markdown file
        markdown_content = self.read_markdown(input_file)

        # Process and translate content
        translated_content = self.process_markdown_content(markdown_content, source_lang, target_lang)

        # Write to new Markdown file
        self.write_markdown(output_file, translated_content)
if __name__ == "__main__":
    # Language pair: English source and Chinese target (the defaults)
    src_lang, dst_lang = 'en', 'zh'

    # Markdown file to read and Markdown file to write
    md_in, md_out = 'input.md', 'output.md'

    # Build the translator from config.json and run the translation
    MarkdownTranslator('config.json').translate_file(md_in, md_out, src_lang, dst_lang)

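The content of the config.json file is as follows. `OPENAI_API_BASE` is a custom server address that meets the OpenAI API format (it must include the `/v1` path). JSON does not allow comments, so the file must not contain any:

{
    "OPENAI_API_KEY": "your_openai_api_key",
    "OPENAI_API_BASE": "https://api.openai.com/v1"
}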

Using the tool is simple: in the `__main__` block at the bottom of the script, set the input Markdown file path, the output Markdown file path and, optionally, the source and target languages, then run the script to translate the English content of the Markdown file into Chinese.