Previously, a tool was written to convert SCI literature PDFs into Markdown format. The tool presented here extends that toolchain: it translates the converted Markdown into Chinese immediately after conversion.
First, install the dependencies (the code uses the module-level client interface of openai 1.0 or later):
pip install openai
The business code is as follows:
import openai
import json
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
# Configure the root logger once at import time: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class MarkdownTranslator:
    """Translate a Markdown file line by line through the OpenAI chat API.

    The constructor loads a JSON config file and copies its
    ``OPENAI_API_KEY`` and ``OPENAI_API_BASE`` values onto the module-level
    ``openai`` settings, so every instance shares the same client
    configuration.
    """

    def __init__(self, config_file):
        """Load *config_file* and configure the module-level openai client.

        Args:
            config_file: Path to a JSON file providing ``OPENAI_API_KEY`` and
                ``OPENAI_API_BASE``.

        Raises:
            Exception: Whatever ``load_config`` raises when the file cannot
                be read or parsed.
        """
        self.config = self.load_config(config_file)
        # Get OpenAI API key and custom server address from the config file
        openai.api_key = self.config.get('OPENAI_API_KEY')
        openai.base_url = self.config.get('OPENAI_API_BASE')
        openai.default_headers = {"x-foo": "true"}

    @staticmethod
    def _mask_secret(secret, visible=4):
        """Return *secret* with its middle hidden so it is safe to log.

        Args:
            secret: The sensitive value (may be ``None`` or empty).
            visible: Number of leading and trailing characters left readable.

        Returns:
            ``'<not set>'`` for a missing value, a run of ``*`` for values too
            short to reveal anything safely, otherwise ``head...tail``.
        """
        if not secret:
            return '<not set>'
        secret = str(secret)
        if len(secret) <= visible * 2:
            return '*' * len(secret)
        return f"{secret[:visible]}...{secret[-visible:]}"

    def load_config(self, config_file):
        """Read and return the parsed JSON content of *config_file*.

        Raises:
            Exception: Any error from opening or parsing the file is logged
                and re-raised unchanged.
        """
        try:
            with open(config_file, 'r', encoding='utf-8') as file:
                return json.load(file)
        except Exception as e:
            logging.error(f"Error reading config file {config_file}: {e}")
            raise

    def read_markdown(self, file_path):
        """Return the full text of the UTF-8 Markdown file at *file_path*.

        Raises:
            Exception: Any read error is logged and re-raised unchanged.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except Exception as e:
            logging.error(f"Error reading file {file_path}: {e}")
            raise

    def write_markdown(self, file_path, content):
        """Write *content* to *file_path* as UTF-8, replacing any existing file.

        Raises:
            Exception: Any write error is logged and re-raised unchanged.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content)
        except Exception as e:
            logging.error(f"Error writing file {file_path}: {e}")
            raise

    def translate_text(self, text, source_lang='en', target_lang='zh'):
        """Translate *text* with a single chat-completion request.

        Args:
            text: The text to translate.
            source_lang: Language name/code inserted into the prompt.
            target_lang: Language name/code inserted into the prompt.

        Returns:
            The stripped model reply, or *text* unchanged when the request
            (or reading its reply) fails for any reason.
        """
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": f"Please translate the following {source_lang} text into {target_lang}:\n{text}"
                    }
                ]
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            # Deliberate best effort: one failed line must not abort the
            # whole document, so the original text is kept in its place.
            logging.error(f"Error translating text: {e}")
            return text  # Return original text to prevent translation failure

    def process_markdown_content(self, content, source_lang, target_lang):
        """Translate *content* line by line, preserving line order.

        Lines that are empty or whitespace-only are emitted as empty lines
        without calling the API; every other line is sent to
        ``translate_text`` from a pool of up to 10 worker threads.

        Returns:
            The translated lines joined with ``'\\n'``.
        """
        lines = content.split('\n')

        def translate_line(line):
            if not line.strip():
                return ''  # Keep empty lines
            return self.translate_text(line, source_lang, target_lang)

        with ThreadPoolExecutor(max_workers=10) as executor:
            # Executor.map yields results in input order, so no shared list
            # or post-hoc sort by index is needed.
            translated_lines = list(executor.map(translate_line, lines))
        return '\n'.join(translated_lines)

    def translate_file(self, input_file, output_file, source_lang='en', target_lang='zh'):
        """Translate the Markdown file *input_file* and write it to *output_file*.

        Args:
            input_file: Path of the Markdown file to read.
            output_file: Path of the Markdown file to write.
            source_lang: Source language passed to the translation prompt.
            target_lang: Target language passed to the translation prompt.

        Raises:
            Exception: Read/write errors from ``read_markdown`` and
                ``write_markdown`` propagate to the caller.
        """
        # Print parameters
        logging.info(f"Translating file from {source_lang} to {target_lang}...")
        # Never write the full API key to the log; show only enough of it to
        # tell which key is in use.
        logging.info(f"OpenAi_key: {self._mask_secret(openai.api_key)}")
        logging.info(f"OpenAi_base: {openai.base_url}")
        # Read original Markdown file
        markdown_content = self.read_markdown(input_file)
        # Process and translate content
        translated_content = self.process_markdown_content(markdown_content, source_lang, target_lang)
        # Write to new Markdown file
        self.write_markdown(output_file, translated_content)
if __name__ == "__main__":
    # Language pair for this run: English source, Chinese target.
    source_language = 'en'
    target_language = 'zh'
    # Markdown file to read and the file the translation is written to.
    input_file_path = 'input.md'
    output_file_path = 'output.md'

    # Credentials and server address come from config.json.
    translator = MarkdownTranslator('config.json')
    translator.translate_file(
        input_file_path,
        output_file_path,
        source_lang=source_language,
        target_lang=target_language,
    )
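The content of the config.json file is as follows. OPENAI_API_BASE is the server address; it may point to any custom server that implements the OpenAI API format and should include the /v1 path prefix. JSON does not allow comments, so the file must contain only these two keys:
{
"OPENAI_API_KEY": "your_openai_api_key",
"OPENAI_API_BASE": "https://api.openai.com/v1/"
}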
The usage of this tool is also very simple; you just need to specify the input Markdown file path and the output Markdown file path, as well as optional source and target languages, to translate the English content in the Markdown file into Chinese.