"""Extract text from uploaded lawsuit documents and validate their content."""

import logging
import os

import docx2txt
import pdftotext
import pytesseract
import regex as re
from PIL import Image

logger = logging.getLogger(__name__)


def process_jpg(f):
    """Run OCR on an image and return the recognized text.

    Args:
        f: Anything ``PIL.Image.open`` accepts (a path or a binary file
            object).

    Returns:
        The text recognized by Tesseract using its ``ces`` language data.
    """
    # The context manager releases the image's resources once OCR is done.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")


def process_pdf(f):
    """Return the text of a PDF, with its parts separated by a blank line.

    Args:
        f: The object handed to ``pdftotext.PDF`` (presumably a binary
            file object -- verify against the caller).
    """
    pdf = pdftotext.PDF(f)
    return "\n\n".join(pdf)


def process_docx(f):
    """Return the text extracted from a DOCX document by ``docx2txt``."""
    # TODO weird output with many spaces
    return docx2txt.process(f)


# Maps a lower-cased filename suffix to the extractor that handles it.
_PROCESSORS = {
    ".jpg": process_jpg,
    ".jpeg": process_jpg,
    ".pdf": process_pdf,
    ".docx": process_docx,
}


def process_file(f):
    """Extract text from an uploaded file, choosing the extractor by suffix.

    Args:
        f: An upload object exposing a ``filename`` attribute; the object
            itself is passed on to the matching extractor.

    Returns:
        The extracted text, or ``None`` when the filename is missing or its
        suffix is not one of the supported formats.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # ``filename`` may be None/empty; treat that as "no suffix".
    _, ext = os.path.splitext(f.filename or "")
    # Suffix matching is case-insensitive so that e.g. "SCAN.PDF" works.
    ext = ext.lower()
    logger.debug("detected file extension: %r", ext)

    processor = _PROCESSORS.get(ext)
    if processor is None:
        return None
    return processor(f)


# Matches the name of a Czech court followed by its seat.
# Groups: 1 = court type, 2 = optional "správní" part, 3 = optional
# preposition "v"/"ve", 4 = the word following the court name.
# The trailing ``\S*`` pieces tolerate inflected forms ("Krajskému soudu",
# "Nejvyššího správního soudu"); "správní" is optional so that the plain
# "Nejvyšší soud" is recognized as well.
court_pat = re.compile(
    r"(okresní|krajsk\S*|vrchní|nejvyšší\S*(\s+správní\S*)?|ústavní)"
    r"\S*\s+soud\S*(\s+ve?)?\s+(\S+)",
    flags=re.IGNORECASE,
)


def validate_court(lawsuit: str):
    """Search the lawsuit text for a court designation.

    Args:
        lawsuit: Full text of the lawsuit.

    Returns:
        The first match object of ``court_pat`` found in the text, or
        ``None`` when no court is mentioned.
    """
    return court_pat.search(lawsuit)


def validate_accuser(lawsuit):
    """Placeholder: not implemented yet, always returns ``None``."""
    pass  # hard to implement


def validate_topic():
    """Placeholder: not implemented yet, always returns ``None``."""
    pass  # also hard to implement


def validate_intent():
    """Placeholder: not implemented yet, always returns ``None``."""
    pass


def validate_signature():
    """Placeholder: not implemented yet, always returns ``None``."""
    pass


def validate_date():
    """Placeholder: not implemented yet, always returns ``None``."""
    pass


def validate(text):
    """Placeholder: not implemented yet, always returns ``None``."""
    pass


# debug
if __name__ == "__main__":
    import sys

    # Read the whole lawsuit from stdin and show the court match (or None).
    print(validate_court(sys.stdin.read()))