# res-judicata/validator/validator.py

import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
import regex as re
from typing import Union, Tuple, Literal
from pdf2image import convert_from_bytes

# Number of alphanumeric characters that a PDF's extracted text layer must
# exceed for process_pdf to use it directly; at or below this count
# process_pdf falls back to OCR on the rendered pages.
PDF_CHARACTER_THRESHOLD = 10

def process_jpg(f):
    """Extract text from an image using Tesseract OCR.

    Args:
        f: Anything accepted by ``PIL.Image.open`` (a path or a binary
            file-like object).

    Returns:
        The text recognised by Tesseract using the "ces" language data.
    """
    # Use the image as a context manager so Pillow releases its resources
    # (and closes the file if it opened one itself) as soon as OCR is done.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")

def process_pdf(f):
    """Extract text from a PDF, using OCR when the text layer is too sparse.

    Args:
        f: Binary file-like object holding the PDF; it must support
            ``seek()`` and ``read()`` for the OCR fallback.

    Returns:
        The page texts joined by blank lines. The embedded text layer is
        returned when it contains more than ``PDF_CHARACTER_THRESHOLD``
        alphanumeric characters; otherwise every page is rendered to an
        image and recognised with Tesseract ("ces" language data).
    """
    pages = pdftotext.PDF(f)
    embedded_text = "\n\n".join(pages)

    alnum_total = sum(1 for ch in embedded_text if ch.isalnum())
    if alnum_total > PDF_CHARACTER_THRESHOLD:
        return embedded_text

    # Rewind to the start before handing the raw bytes to pdf2image.
    f.seek(0, 0)
    page_images = convert_from_bytes(f.read())
    recognised = [
        pytesseract.image_to_string(page_image, lang="ces")
        for page_image in page_images
    ]
    return "\n\n".join(recognised)

def process_docx(f):
    """Return the text that ``docx2txt`` extracts from the DOCX document *f*."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted


def process_file(f) -> str:
    """Extract plain text from a file, dispatching on its filename extension.

    Args:
        f: File-like object exposing a ``filename`` attribute and ``read()``
            (presumably a web-framework upload object; verify against the
            caller).

    Returns:
        The extracted text. ``.jpg``/``.jpeg`` files go through OCR, ``.pdf``
        and ``.docx`` through their dedicated extractors; anything else is
        treated as plain text.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is treated as "no extension"; the comparison is
    # case-insensitive so that e.g. "SCAN.JPG" or "Filing.PDF" are recognised.
    _, ext = os.path.splitext(f.filename or "")
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)

    data = f.read()
    if isinstance(data, (bytes, bytearray)):
        # str() on bytes would yield the "b'...'" repr rather than the text.
        # Decode instead, assuming UTF-8 and replacing undecodable bytes so a
        # stray byte does not abort processing.
        return data.decode("utf-8", errors="replace")
    return str(data)

# Matches a Czech court designation: a court-type adjective (okresní, krajský,
# vrchní, nejvyšší [správní], ústavní) in any grammatical case, the noun
# "soud" with an optional ending, an optional preposition "v"/"ve", and the
# following run of word and whitespace characters (captured as group 1).
# - "krajsk" is used as the stem so declined forms such as "Krajskému" match.
# - "správní" is optional so that a plain "Nejvyšší soud" matches as well as
#   "Nejvyšší správní soud"; the ending of "nejvyšší" may be declined too.
court_pat = re.compile(
    r"\b(?:okresní|krajsk|vrchní|nejvyšší\w*(?:\s+správní)?|ústavní)\w*"
    r"\s+soud\w*(?:\s+ve?)?\s+([\w\s]+)",
    flags=re.IGNORECASE,
)


def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first court designation in *text_content*.

    Args:
        text_content: Text to search.

    Returns:
        The ``(start, end)`` span of the first match of ``court_pat``, or
        ``False`` when no court designation is found.
    """
    match = court_pat.search(text_content)
    return match.span() if match is not None else False

def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the accuser check; not implemented yet.

    Currently ignores *text_content* and always returns ``None``.
    """
    return None

# hard to implement
def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the topic check; not implemented yet.

    Currently ignores *text_content* and always returns ``None``.
    """
    return None

# also hard to implement
def validate_intent(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the intent check; not implemented yet.

    Currently ignores *text_content* and always returns ``None``.
    """
    return None

def validate_signature(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the signature check; not implemented yet.

    Currently ignores *text_content* and always returns ``None``.
    """
    return None

# Heuristic for a "place and date" phrase: the preposition "v"/"ve", the rest
# of that line, a run of whitespace/punctuation separators, an optional "dne",
# another run of separators, and finally digits followed by a period.
date_and_place_pat = re.compile(
    r"\bve?\s+[^\n]+(?:\s|[.,\-–—:])+(?:dne)?(?:\s|[.,\-–—:])+\d+\.",
    flags=re.IGNORECASE,
)


def validate_date_and_place(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first place-and-date phrase in *text_content*.

    Args:
        text_content: Text to search.

    Returns:
        The ``(start, end)`` span of the first match of
        ``date_and_place_pat``, or ``False`` when nothing matches.
    """
    found = date_and_place_pat.search(text_content)
    if found is not None:
        return found.span()
    return False

def validate(text_content: str) -> object:
    """Run the implemented checks on *text_content* and bundle the results.

    Args:
        text_content: Text of the document to validate.

    Returns:
        A dict with two keys: ``"checks"`` maps each check name (``"court"``,
        ``"date_and_place"``) to that validator's result, and
        ``"parsed_content"`` holds the input text unchanged.
    """
    court_result = validate_court(text_content)
    date_and_place_result = validate_date_and_place(text_content)
    checks = {
        "court": court_result,
        "date_and_place": date_and_place_result,
    }
    return {"checks": checks, "parsed_content": text_content}

# debug: validate the text piped in on stdin and print the raw result
if __name__ == "__main__":
    import sys

    stdin_text = str(sys.stdin.read())
    report = validate(stdin_text)
    print(report)