banner
Riceneeder

Riceneeder

白天研究生,晚上研究死
github
email

Convert SCI literature PDF to Markdown using Python

This is a small tool I wrote while writing a literature review to facilitate the organization of literature, which can convert SCI literature PDFs to Markdown format. This way, you can directly edit the literature in a Markdown editor, making it easier to organize, translate, and write.

The specific code is as follows:

import requests
import random
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import scipdf
import string
from datetime import datetime

# Custom exception for handling GROBID service exceptions
class GROBID_OFFLINE_EXCEPTION(Exception):
    """Raised when the GROBID parsing service cannot be reached."""

class PDFToMarkdown:
    """Convert SCI literature PDFs to Markdown via a GROBID parsing service.

    Accepts a single PDF file or a directory tree of PDFs, parses each one
    with scipdf/GROBID in a thread pool, and merges the results into one
    timestamped Markdown file in the current working directory.
    """

    def __init__(self, input_path, grobid_urls=None):
        """
        Initialize PDFToMarkdown instance.

        Args:
            input_path (str): Path to the file or folder to be processed.
            grobid_urls (list): Optional, list of GROBID server URLs.
                Defaults to the preset public instances below.
        """
        self.input_path = input_path
        # Use custom GROBID servers if provided, otherwise use default servers.
        self.grobid_urls = grobid_urls if grobid_urls is not None else [
            "https://qingxu98-grobid.hf.space",
            "https://qingxu98-grobid2.hf.space",
            # ... (other server URLs)
            "https://qingxu98-grobid8.hf.space",
        ]

    def get_avail_grobid_url(self):
        """Return a reachable GROBID base URL (no trailing slash), or None.

        Servers that time out, refuse the connection, or answer the liveness
        probe with anything other than ``'true'`` are removed from
        ``self.grobid_urls`` so later calls do not re-probe them.

        Bug fixes vs. the original:
        * A server that answered the probe but was not alive was never
          removed, so the random-retry loop could spin forever; every failed
          candidate is now discarded.
        * Removing a URL that was configured with a trailing slash raised
          ``ValueError``, because the stripped copy rather than the stored
          string was passed to ``list.remove``.
        """
        candidates = list(self.grobid_urls)
        random.shuffle(candidates)  # spread load across the servers
        for url in candidates:
            base = url.rstrip('/')
            try:
                # Probe GROBID's liveness endpoint.
                res = requests.get(f"{base}/api/isalive", timeout=5)
                if res.text == 'true':
                    return base  # first live server wins
            except (requests.ConnectionError, requests.Timeout):
                pass  # treated the same as a not-alive response below
            # Drop the dead/non-alive server; remove the stored string,
            # not the stripped copy.
            self.grobid_urls.remove(url)
        return None  # no available servers

    @staticmethod
    def dict_to_markdown(article_json):
        """Convert a parsed-article dictionary to a Markdown string.

        Args:
            article_json (dict): Parsed article; recognised keys are
                'title', 'doi', 'authors', 'abstract' and 'sections'
                (a list of {'heading': ..., 'text': ...} dicts).

        Returns:
            str: The article rendered as Markdown.
        """
        markdown_lines = [
            f"# {article_json.get('title', 'Untitled')} \n",   # Title
            f"> doi:{article_json.get('doi', '')} \n",         # DOI
            f"+ authors\n{article_json.get('authors', ['No authors'])}  \n",
            f"+ abstract\n{article_json.get('abstract', 'No abstract')}  \n",
        ]
        # One bullet per body section: heading, then the full section text.
        for section in article_json.get('sections', []):
            markdown_lines.append(f"+ {section['heading']}\n{section['text']}\n")
        return "\n".join(markdown_lines)

    @staticmethod
    def save_markdown_file(filename, content):
        """Write *content* to *filename* as UTF-8 text."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

    def parse_pdf(self, pdf_path, grobid_url):
        """Parse a single PDF file with GROBID, returning an article dict.

        Raises:
            FileNotFoundError: if *pdf_path* does not exist.
            GROBID_OFFLINE_EXCEPTION: if the GROBID service is unavailable.
            RuntimeError: if scipdf fails to parse the PDF.
        """
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"No PDF file found at the specified path: {pdf_path}")

        grobid_url = grobid_url.rstrip('/')  # scipdf expects no trailing slash

        try:
            # Use GROBID to parse the PDF.
            return scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
        except GROBID_OFFLINE_EXCEPTION as exc:
            # Chain the original exception so the root cause is not lost.
            raise GROBID_OFFLINE_EXCEPTION(
                "GROBID service is unavailable, check GROBID_URL in the configuration."
            ) from exc
        except RuntimeError as exc:
            raise RuntimeError(
                "Failed to parse PDF, please check if the PDF is corrupted."
            ) from exc

    def process_pdf_file(self, pdf_path, grobid_url):
        """Parse one PDF and return its Markdown, or None on any error.

        Errors are deliberately swallowed (best-effort batch processing):
        one corrupt PDF must not abort the whole run.
        """
        print(f"Parsing: {pdf_path}")
        try:
            pdf_article_dict = self.parse_pdf(pdf_path, grobid_url)
            return self.dict_to_markdown(pdf_article_dict)
        except Exception as e:
            print(f"Error occurred while processing file {pdf_path}: {e}")
            return None

    def process(self):
        """Process the input file or folder; return the merged Markdown path.

        Returns:
            str | None: Path of the generated .md file, or None when no PDF
                produced any content.

        Raises:
            RuntimeError: if no GROBID server is reachable.
            ValueError: if ``self.input_path`` is neither a file nor a folder.
        """
        grobid_url = self.get_avail_grobid_url()
        if grobid_url is None:
            raise RuntimeError("No available GROBID service, please check your server configuration.")

        # Determine whether the input path is a file or a folder.
        if os.path.isfile(self.input_path):
            pdf_files = [self.input_path]  # single file
        elif os.path.isdir(self.input_path):
            # Recursively collect PDFs; match the extension case-insensitively
            # so '.PDF' files are not silently skipped (fix vs. original).
            pdf_files = [os.path.join(dirpath, filename)
                         for dirpath, _, filenames in os.walk(self.input_path)
                         for filename in filenames
                         if filename.lower().endswith('.pdf')]
        else:
            raise ValueError("Input path is neither a file nor a folder.")

        markdown_contents = []
        # Parse PDFs concurrently; the GROBID calls are network-bound, so
        # threads overlap the waiting time.
        with ThreadPoolExecutor(max_workers=5) as executor:
            future_to_file = {executor.submit(self.process_pdf_file, pdf, grobid_url): pdf
                              for pdf in pdf_files}
            for future in as_completed(future_to_file):
                result = future.result()
                if result:
                    markdown_contents.append(result)

        if not markdown_contents:
            print("No valid Markdown content generated.")
            return None

        # Timestamp + two random letters keeps repeated runs from colliding.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_suffix = ''.join(random.choices(string.ascii_lowercase, k=2))
        output_filename = f"{timestamp}_{random_suffix}.md"
        self.save_markdown_file(output_filename, "\n\n".join(markdown_contents))
        print(f"All Markdown files have been merged and saved as {output_filename}")
        return output_filename


# Usage example, executed only when the script is run directly.
if __name__ == "__main__":
    # Replace with the PDF file or directory you want to convert.
    input_path = 'your_file_or_directory_path'
    # Optional: point at your own GROBID deployments instead of the defaults.
    custom_grobid_urls = [
        "https://your-custom-grobid-server.com",
        "https://another-custom-grobid-server.com",
    ]
    converter = PDFToMarkdown(input_path, grobid_urls=custom_grobid_urls)
    output_file = converter.process()  # parse the PDFs and write the Markdown file
    print("Generated file path:", output_file)

Note that you need to install the following Python libraries (the script imports requests directly, in addition to scipdf):

pip install requests
pip install git+https://github.com/titipata/scipdf_parser

When using, replace input_path with your file or directory path. This script is multithreaded and can process all PDF files in a folder. If you have your own GROBID server, you can add it to the custom_grobid_urls list; otherwise, it will use the default GROBID servers. Ultimately, a Markdown file will be generated containing the content of all PDF files.

References:

Loading...
Ownership of this post data is guaranteed by blockchain and smart contracts to the creator alone.