This is a small tool I wrote while working on a literature review to help keep my references organized: it converts SCI paper PDFs into Markdown. Once a paper is in Markdown, you can edit it directly in a Markdown editor, which makes it much easier to organize, translate, and write.
The complete code is as follows:
import requests
import random
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import scipdf
import string
from datetime import datetime


# Custom exception for handling GROBID service exceptions
class GROBID_OFFLINE_EXCEPTION(Exception):
    pass


class PDFToMarkdown:
    def __init__(self, input_path, grobid_urls=None):
        """
        Initialize PDFToMarkdown instance.

        Args:
            input_path (str): Path to the file or folder to be processed.
            grobid_urls (list): Optional, list of GROBID server URLs. Defaults to preset URLs.
        """
        self.input_path = input_path
        # Use custom GROBID server if provided, otherwise use default servers
        self.grobid_urls = grobid_urls if grobid_urls is not None else [
            "https://qingxu98-grobid.hf.space",
            "https://qingxu98-grobid2.hf.space",
            # ... (other server URLs)
            "https://qingxu98-grobid8.hf.space",
        ]

    def get_avail_grobid_url(self):
        """Get an available GROBID server URL"""
        if not self.grobid_urls:
            return None
        while self.grobid_urls:
            _grobid_url = random.choice(self.grobid_urls)  # Randomly select a GROBID URL
            cleaned_url = _grobid_url.rstrip('/')  # Strip a trailing slash, but keep the original entry so remove() still matches
            try:
                # Check if the server is online
                res = requests.get(f"{cleaned_url}/api/isalive", timeout=5)
                if res.text == 'true':
                    return cleaned_url  # Return available URL
                # Server responded but is not alive: remove it so the loop cannot spin forever
                self.grobid_urls.remove(_grobid_url)
            except (requests.ConnectionError, requests.Timeout):
                # If connection error or timeout, remove this URL from the list
                self.grobid_urls.remove(_grobid_url)
        return None  # Return None if no available servers

    @staticmethod
    def dict_to_markdown(article_json):
        """Convert article dictionary to Markdown format string"""
        markdown_lines = []
        markdown_lines.append(f"# {article_json.get('title', 'Untitled')} \n")  # Title
        markdown_lines.append(f"> doi:{article_json.get('doi', '')} \n")  # DOI
markdown_lines.append(f"+ authors\n{article_json.get('authors', ['No authors'])} \n") # Authors
markdown_lines.append(f"+ abstract\n{article_json.get('abstract', 'No abstract')} \n") # Abstract
# Process each section's content
if 'sections' in article_json:
for section in article_json['sections']:
markdown_lines.append(f"+ {section['heading']}\n{section['text']}\n") # Section title and content
return "\n".join(markdown_lines) # Return merged Markdown string
@staticmethod
def save_markdown_file(filename, content):
"""Write content to Markdown file"""
with open(filename, 'w', encoding='utf-8') as f:
f.write(content) # Write content to file
def parse_pdf(self, pdf_path, grobid_url):
"""Parse a single PDF file, returning an article dictionary"""
if not os.path.isfile(pdf_path):
raise FileNotFoundError(f"No PDF file found at the specified path: {pdf_path}") # Check if file exists
if grobid_url.endswith('/'):
grobid_url = grobid_url.rstrip('/')
try:
# Use GROBID to parse PDF
return scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
except GROBID_OFFLINE_EXCEPTION:
raise GROBID_OFFLINE_EXCEPTION("GROBID service is unavailable, check GROBID_URL in the configuration.")
except RuntimeError:
raise RuntimeError("Failed to parse PDF, please check if the PDF is corrupted.")
def process_pdf_file(self, pdf_path, grobid_url):
"""Process a single PDF file, returning Markdown content"""
print(f"Parsing: {pdf_path}")
try:
pdf_article_dict = self.parse_pdf(pdf_path, grobid_url) # Parse PDF file
return self.dict_to_markdown(pdf_article_dict) # Convert to Markdown
except Exception as e:
print(f"Error occurred while processing file {pdf_path}: {e}")
return None # Return None on error
def process(self):
"""Process input file or folder and return the generated Markdown file path"""
markdown_contents = [] # Store all Markdown content
grobid_url = self.get_avail_grobid_url()
if grobid_url is None:
raise RuntimeError("No available GROBID service, please check your server configuration.")
# Determine if the input path is a file or a folder
if os.path.isfile(self.input_path):
pdf_files = [self.input_path] # Single file
elif os.path.isdir(self.input_path):
# Collect all PDF files in the folder
pdf_files = [os.path.join(dirpath, filename)
for dirpath, _, filenames in os.walk(self.input_path)
for filename in filenames if filename.endswith('.pdf')]
else:
raise ValueError("Input path is neither a file nor a folder.")
# Use thread pool to process PDF files in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
future_to_file = {executor.submit(self.process_pdf_file, pdf, grobid_url): pdf for pdf in pdf_files}
# Collect generated Markdown content
for future in as_completed(future_to_file):
result = future.result()
if result:
markdown_contents.append(result)
# If there is valid Markdown content, save it to a file
if markdown_contents:
# Generate timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Generate two random letters
random_suffix = ''.join(random.choices(string.ascii_lowercase, k=2))
output_filename = f"{timestamp}_{random_suffix}.md"
self.save_markdown_file(output_filename, "\n\n".join(markdown_contents)) # Merge and save as Markdown file
print(f"All Markdown files have been merged and saved as {output_filename}")
return output_filename # Return generated file path
else:
print("No valid Markdown content generated.")
return None
# If this script is run directly, provide usage example
if __name__ == "__main__":
input_path = 'your_file_or_directory_path' # Replace with your file or directory path
custom_grobid_urls = [
"https://your-custom-grobid-server.com",
"https://another-custom-grobid-server.com",
]
pdf_to_markdown = PDFToMarkdown(input_path, grobid_urls=custom_grobid_urls)
output_file = pdf_to_markdown.process() # Process PDF files and generate Markdown file
print("Generated file path:", output_file) # Output generated file path
Note that you need to install the following Python library:
pip install git+https://github.com/titipata/scipdf_parser
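Note that scipdf_parser also pulls in spaCy; if parsing fails with a missing-model error, installing the small English model (python -m spacy download en_core_web_sm) should resolve it.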
To use it, replace input_path with your own file or directory path. The script is multithreaded and can process every PDF file in a folder. If you have your own GROBID server, add it to the custom_grobid_urls list; otherwise the default GROBID servers are used. In the end, a single Markdown file is generated that contains the content of all processed PDFs.
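As a minimal sketch, assuming the class above is saved as pdf_to_markdown.py and you run your own GROBID instance on its default port 8070 (both the module name and the URL are placeholders to adjust to your setup), usage looks like this:

from pdf_to_markdown import PDFToMarkdown  # hypothetical module name for the script above

# Point the converter at a folder of PDFs and a self-hosted GROBID instance
converter = PDFToMarkdown(
    "papers/",                              # a folder of PDFs, or the path of a single .pdf
    grobid_urls=["http://localhost:8070"],  # assumed local GROBID server; omit to use the defaults
)
output_file = converter.process()
print("Generated file path:", output_file)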