In [None]:
import os
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4

def create_title_page(title, output_filename):
    """Write a PDF title page in A4 format that shows ``title`` mid-page.

    Args:
        title: Text to print; every underscore is replaced by a space first.
        output_filename: Output target, handed unchanged to ``canvas.Canvas``.

    Returns:
        None. The document is emitted when the canvas is saved.
    """
    page_width, page_height = A4
    display_title = title.replace('_', ' ')

    page = canvas.Canvas(output_filename, pagesize=A4)
    page.setFont('Helvetica', 12)
    # Anchor the string at the midpoint of the sheet in both directions.
    page.drawCentredString(page_width / 2.0, page_height / 2.0, display_title)
    page.save()

def merge_pdfs_with_title_pages(folder_path, output_filename='rki_protolle_merged.pdf'):
    """Merge every PDF in ``folder_path`` into one file, each preceded by a title page.

    Args:
        folder_path: Directory that is listed for file names ending in ``.pdf``.
        output_filename: Path of the merged PDF to write. Previously this
            argument was silently replaced by ``"rki_protolle_merged.pdf"``;
            that name is now the default, so calls that omit the argument
            keep writing the same file while explicit values are honoured.

    Returns:
        None. The merged document is written to ``output_filename`` and a
        status line is printed.

    The title page text is the file name itself; ``create_title_page``
    replaces its underscores with spaces.
    """
    from io import BytesIO

    pdf_writer = PdfWriter()

    # A merged file left in the source folder by an earlier run must not be
    # merged into itself.
    output_abspath = os.path.abspath(output_filename)

    # Primary order: the first three characters of the file name. The full
    # name breaks ties because os.listdir() returns entries in arbitrary
    # order, which would otherwise leak into the merged document.
    pdf_files = sorted(
        (
            name
            for name in os.listdir(folder_path)
            if name.endswith('.pdf')
            and os.path.abspath(os.path.join(folder_path, name)) != output_abspath
        ),
        key=lambda name: (name[0:3], name),
    )

    for item in pdf_files:
        # Render the title page into memory instead of a shared
        # 'title_page.pdf' in the working directory: nothing is left behind
        # on disk, and no file is closed or overwritten while the writer may
        # still reference its pages.
        title_page_buffer = BytesIO()
        create_title_page(item, title_page_buffer)
        title_page_buffer.seek(0)
        for page in PdfReader(title_page_buffer).pages:
            pdf_writer.add_page(page)

        # Now append the pages of the actual document.
        pdf_reader = PdfReader(os.path.join(folder_path, item))
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)

    print(f"Merged PDF saved as '{output_filename}'.")

# Example usage
# The folder can be overridden per machine through the RKI_PDF_FOLDER
# environment variable; the original location remains the default.
folder_path = os.environ.get(
    'RKI_PDF_FOLDER',
    '/Users/jan/Nextcloud2/hr-DDJ/projekte/covid_rki_gpts/HiDrive/',
)
merge_pdfs_with_title_pages(folder_path)


# PDFs sind nicht ideal

ChatGPT ist nicht in der Lage, das erzeugte PDF korrekt auszuwerten – Wörter werden mit massiven Tippfehlern ausgelesen, was dazu führt, dass die Datumsangaben der Protokolle falsch identifiziert werden: Das Protokoll 2020-03-17 (ab Seite 345) wird einfach nicht gefunden.

In [3]:
import pdfplumber
import os
from markdownify import markdownify as md
import re

# Literal text fragments to rewrite in extracted text; an empty replacement
# removes the fragment altogether.
replacement_dict = {
    # Classification banner -> removed.
    'VS - NUR FÜR DEN DIENSTGEBRAUCH Einstufung aufgehoben am 11.01.2023 durch VPräs': '',
    # "Ergebnisprotokoll" directly after a line break -> level-2 Markdown heading.
    '\nErgebnisprotokoll': '\n## ERGEBNISPROTOKOLL',
}


def pdf_strip(text):
    """Apply every entry of ``replacement_dict`` to ``text`` and return the result.

    Keys are matched as literal substrings via ``str.replace`` (not as regular
    expressions) and are applied in the dict's insertion order.
    """
    cleaned = text
    for old_fragment in replacement_dict:
        cleaned = cleaned.replace(old_fragment, replacement_dict[old_fragment])
    return cleaned

# pdfminer.six import
# taken from https://medium.com/social-impact-analytics/comparing-4-methods-for-pdf-text-extraction-in-python-fd34531034f
def pdf_minersix_to_txt(path):
    """Extract the text of the PDF at ``path`` with pdfminer.six.

    Pages are interpreted one after another into a shared in-memory text
    buffer. Before each page a progress line with its zero-based index is
    printed.

    Args:
        path: Location of the PDF file; it is opened in binary mode.

    Returns:
        str: Everything the text converter wrote to the buffer.
    """
    # Imported locally, so pdfminer.six is only needed when this extractor runs.
    from io import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFParser

    text_buffer = StringIO()
    with open(path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, converter)
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            print(f"Processing page {page_index}...")
            interpreter.process_page(page)
    return text_buffer.getvalue()

def pdf_plumber_to_text(path):
    """Extract the text of the PDF at ``path`` page by page with pdfplumber.

    For each page a progress line with its one-based number is printed, the
    extracted text is cleaned with ``pdf_strip`` and a newline is appended.

    Args:
        path: Location of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        str: The cleaned page texts concatenated in page order; an empty
        string when the document has no pages.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f"Converting page {page_number}")
            # extract_text() can yield None for a page without any text in
            # some pdfplumber versions; re.sub() would raise TypeError on it.
            raw_text = page.extract_text() or ""
            # Removes literal "header:" / "footer:" labels (any case) at the
            # start of the text or of a line, including the preceding line
            # break and whitespace. Running headers and footers of the
            # document itself are not detected by this pattern.
            truncated_text = re.sub(r'(?i)(?:^|\n)\s*(?:header|footer):', '', raw_text)
            page_texts.append(pdf_strip(truncated_text) + "\n")
    return "".join(page_texts)

def convert_pdf_to_md(file, output_filename):
  """Placeholder: only prints a status line that names ``output_filename``.

  No conversion is performed and no file is written here; ``file`` is unused.
  NOTE(review): the printed wording is the same as the PDF-merge status
  message - presumably a stub that was never filled in; confirm before
  relying on it.
  """

  print(f"Merged PDF saved as '{output_filename}'.")

# Source folder of the protocol PDFs. It can be overridden per machine through
# the RKI_PDF_FOLDER environment variable; the original location is the default.
folder_path = os.environ.get(
    'RKI_PDF_FOLDER',
    '/Users/jan/Nextcloud2/hr-DDJ/projekte/covid_rki_gpts/HiDrive/',
)

# Ordered by the first three characters of the file name; the full name breaks
# ties so the result does not depend on os.listdir()'s arbitrary order.
pdf_files = sorted(
    (name for name in os.listdir(folder_path) if name.endswith('.pdf')),
    key=lambda name: (name[0:3], name),
)

# The bare file names from os.listdir() and the relative output path below are
# both resolved against the current working directory.
os.chdir(folder_path)

sections = []
for item in pdf_files:
    # File name without extension, used as the document heading.
    base_name = os.path.splitext(item)[0]

    # Extract the text with pdfminer.six.
    single_text = pdf_minersix_to_txt(item)

    # The heading needs its own line: without the line breaks it would run
    # straight into the first line of the document, and the next heading
    # would be glued to the end of the previous document.
    sections.append(
        "# " + base_name.replace('_', ' ') + "\n\n" + pdf_strip(single_text) + "\n\n"
    )
text = "".join(sections)

output_filename = "../rki_merged.md"
with open(output_filename, 'w', encoding='utf-8') as out:
    out.write(text)
print(f'Created file {output_filename}...done.')


Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 0...
Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 0...
Processing pa