Pythonを使用してSCI文献のPDFをMarkdownに変換する

これは以前文献レビューを書くときに、文献を整理するために作成した小さなツールで、SCI 文献の PDF を Markdown 形式に変換できます。こうすることで、Markdown エディタで文献を直接編集でき、整理、翻訳、執筆が便利になります。

具体的なコードは以下の通りです：

import requests
import random
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import scipdf
import string
from datetime import datetime

# Custom exception used to signal GROBID service failures.
class GROBID_OFFLINE_EXCEPTION(Exception):
    """Raised when the GROBID service cannot be used."""

class PDFToMarkdown:
    """Convert one PDF, or every PDF under a folder, into a single Markdown file.

    Each PDF is parsed with ``scipdf`` against a GROBID server chosen from
    ``grobid_urls``; the parsed articles are rendered to Markdown and merged
    into one timestamped ``.md`` file in the current working directory.
    """

    def __init__(self, input_path, grobid_urls=None):
        """
        Initialize a PDFToMarkdown instance.

        Args:
            input_path (str): Path of the file or folder to process.
            grobid_urls (list): Optional list of GROBID server URLs. When
                omitted (``None``), the built-in default list is used.
        """
        self.input_path = input_path
        if grobid_urls is None:
            grobid_urls = [
                "https://qingxu98-grobid.hf.space",
                "https://qingxu98-grobid2.hf.space",
                # ... (other server URLs)
                "https://qingxu98-grobid8.hf.space",
            ]
        # Keep a private copy: get_avail_grobid_url() prunes unusable servers
        # from this list and must not modify the list owned by the caller.
        self.grobid_urls = list(grobid_urls)

    def get_avail_grobid_url(self):
        """Return the URL of a GROBID server that answers the liveness check.

        Servers are tried in random order. Every server that fails the check
        (request error, or a response other than ``true``) is removed from
        ``self.grobid_urls`` so it is not tried again.

        Returns:
            str | None: The server URL without trailing slashes, or ``None``
            when no server is usable.
        """
        while self.grobid_urls:
            candidate = random.choice(self.grobid_urls)
            base_url = candidate.rstrip('/')
            try:
                res = requests.get(f"{base_url}/api/isalive", timeout=5)
                if res.text.strip() == 'true':
                    return base_url
            except requests.RequestException:
                # Unreachable or otherwise failing server: drop it below.
                pass
            # Remove the entry exactly as stored (not the stripped form, which
            # may not be in the list). Dropping servers that respond with
            # something other than 'true' guarantees the loop terminates.
            self.grobid_urls.remove(candidate)
        return None

    @staticmethod
    def dict_to_markdown(article_json):
        """Render a parsed-article dictionary as a Markdown string.

        Args:
            article_json (dict): Article data. The keys read are ``title``,
                ``doi``, ``authors``, ``abstract`` and ``sections`` (an
                iterable of dicts with ``heading`` and ``text``). Missing
                keys fall back to placeholders or are skipped.

        Returns:
            str: The Markdown text for the article.
        """
        authors = article_json.get('authors', '無作者')
        if isinstance(authors, (list, tuple)):
            # Join author lists instead of printing the Python list repr.
            authors = ", ".join(str(author) for author in authors)

        markdown_lines = [
            f"# {article_json.get('title', '無題')} \n",
            f"> doi:{article_json.get('doi', '')} \n",
            f"+ authors\n{authors}  \n",
            f"+ abstract\n{article_json.get('abstract', '無要約')}  \n",
        ]

        # One bullet per section; tolerate a missing/None section list and
        # sections lacking a heading or text.
        for section in article_json.get('sections') or []:
            heading = section.get('heading', '')
            text = section.get('text', '')
            markdown_lines.append(f"+ {heading}\n{text}\n")

        return "\n".join(markdown_lines)

    @staticmethod
    def save_markdown_file(filename, content):
        """Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

    def parse_pdf(self, pdf_path, grobid_url):
        """Parse a single PDF file and return the article dictionary.

        Args:
            pdf_path (str): Path of the PDF file.
            grobid_url (str): Base URL of the GROBID server to use.

        Returns:
            The value returned by ``scipdf.parse_pdf_to_dict``.

        Raises:
            FileNotFoundError: ``pdf_path`` is not an existing file.
            GROBID_OFFLINE_EXCEPTION: Re-raised with a user-facing message.
            RuntimeError: Re-raised with a user-facing message.
        """
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"指定されたパスにPDFファイルが見つかりません: {pdf_path}")

        grobid_url = grobid_url.rstrip('/')

        try:
            return scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
        except GROBID_OFFLINE_EXCEPTION as err:
            # Chain the cause so the original traceback is not lost.
            raise GROBID_OFFLINE_EXCEPTION("GROBIDサービスが利用できません。設定のGROBID_URLを確認してください。") from err
        except RuntimeError as err:
            raise RuntimeError("PDFの解析に失敗しました。PDFが破損していないか確認してください。") from err

    def process_pdf_file(self, pdf_path, grobid_url):
        """Parse one PDF and return its Markdown text, or ``None`` on failure.

        Any exception is reported on stdout and swallowed so that a single
        bad file does not abort a whole batch.
        """
        print(f"解析中: {pdf_path}")
        try:
            pdf_article_dict = self.parse_pdf(pdf_path, grobid_url)
            return self.dict_to_markdown(pdf_article_dict)
        except Exception as e:  # deliberate best-effort boundary per file
            print(f"ファイル {pdf_path} の処理中にエラーが発生しました: {e}")
            return None

    def process(self):
        """Process the input file or folder and write the merged Markdown file.

        Returns:
            str | None: Name of the generated Markdown file, or ``None`` when
            no PDF produced any Markdown.

        Raises:
            RuntimeError: No GROBID server is available.
            ValueError: ``input_path`` is neither a file nor a folder.
        """
        grobid_url = self.get_avail_grobid_url()
        if grobid_url is None:
            raise RuntimeError("利用可能なGROBIDサービスがありません。サーバー設定を確認してください。")

        if os.path.isfile(self.input_path):
            pdf_files = [self.input_path]
        elif os.path.isdir(self.input_path):
            # Walk the folder recursively. The extension match is
            # case-insensitive so files named "*.PDF" are included, and the
            # list is sorted so the merged document has a stable order.
            pdf_files = sorted(
                os.path.join(dirpath, filename)
                for dirpath, _, filenames in os.walk(self.input_path)
                for filename in filenames
                if filename.lower().endswith('.pdf')
            )
        else:
            raise ValueError("入力パスはファイルでもフォルダでもありません。")

        # Parse the PDFs concurrently, then collect results in submission
        # order (not completion order) so the output is deterministic.
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(self.process_pdf_file, pdf, grobid_url)
                for pdf in pdf_files
            ]
            results = [future.result() for future in futures]
        markdown_contents = [result for result in results if result]

        if not markdown_contents:
            print("有効なMarkdown内容が生成されませんでした。")
            return None

        # File name: timestamp plus two random lowercase letters.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_suffix = ''.join(random.choices(string.ascii_lowercase, k=2))
        output_filename = f"{timestamp}_{random_suffix}.md"
        self.save_markdown_file(output_filename, "\n\n".join(markdown_contents))
        print(f"すべてのMarkdownファイルが結合され、{output_filename}として保存されました")
        return output_filename


# Usage example, executed only when this script is run directly.
if __name__ == "__main__":
    # Replace with the path of your PDF file or directory.
    input_path = 'your_file_or_directory_path'
    # Custom GROBID servers passed to the converter.
    custom_grobid_urls = [
        "https://your-custom-grobid-server.com",
        "https://another-custom-grobid-server.com",
    ]
    # Build the converter, run it, and report the generated file's path.
    pdf_to_markdown = PDFToMarkdown(input_path, grobid_urls=custom_grobid_urls)
    output_file = pdf_to_markdown.process()
    print("生成されたファイルのパス:", output_file)

注意：以下の Python ライブラリをインストールする必要があります（コードは requests も import しているため、未インストールの場合は pip install requests も実行してください）：

pip install git+https://github.com/titipata/scipdf_parser

使用時は、input_path をあなたのファイルまたはディレクトリのパスに置き換えてください。このスクリプトはマルチスレッドで動作し、フォルダ内（サブフォルダを含む）のすべての PDF ファイルを処理できます。独自の GROBID サーバーがある場合は、custom_grobid_urls リストに追加してください。独自のサーバーがない場合は、grobid_urls 引数を省略すると、デフォルトの GROBID サーバーが使用されます。最終的に、すべての PDF ファイルの内容を 1 つにまとめた Markdown ファイルが生成されます。

参考：