"""Extract text from uploaded documents and run heuristic checks on it.

The ``process_*`` functions turn an uploaded file (JPEG image, PDF, DOCX or
plain text) into a string; the ``validate_*`` functions look for required
parts of a filing in that string and report where they were found.
"""

import os
from typing import Literal, Tuple, Union

import docx2txt
import pdftotext
import pytesseract
import regex as re
from geneeanlpclient import g3
from pdf2image import convert_from_bytes
from PIL import Image

# Minimum number of alphanumeric characters the embedded PDF text layer must
# exceed before it is trusted; otherwise the pages are OCR-ed instead.
PDF_CHARACTER_THRESHOLD = 10

# Tesseract language pack used for every OCR call in this module.
OCR_LANGUAGE = "ces"

# SECURITY(review): this key is committed to source control. It is kept only
# as a backward-compatible fallback; set GENEEA_API_KEY in the environment,
# rotate this key and then remove the fallback.
_FALLBACK_GENEEA_KEY = '4330765d043bfd5366b04a20c18b2dc0'

# Result of a single check: the (start, end) span of the match in the text,
# or ``False`` when nothing was found.
CheckResult = Union[Tuple[int, int], Literal[False]]


def process_jpg(f):
    """Return the text recognised by OCR in the image file-like object *f*."""
    # Close the image once OCR is done instead of leaking the handle.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang=OCR_LANGUAGE)


def process_pdf(f):
    """Return the text of the PDF in the binary file-like object *f*.

    The embedded text layer is used when it contains more than
    ``PDF_CHARACTER_THRESHOLD`` alphanumeric characters. Otherwise the file
    is rewound, rendered to images and OCR-ed. Pages are separated by a
    blank line in both cases.
    """
    pdf = pdftotext.PDF(f)
    aggregate = "\n\n".join(pdf)

    alnum_count = sum(1 for ch in aggregate if ch.isalnum())
    if alnum_count > PDF_CHARACTER_THRESHOLD:
        return aggregate

    # Too little embedded text (presumably a scanned document): fall back to
    # OCR. pdftotext has consumed the stream, so rewind before re-reading.
    f.seek(0, 0)
    images = convert_from_bytes(f.read())
    return "\n\n".join(
        pytesseract.image_to_string(image, lang=OCR_LANGUAGE)
        for image in images
    )


def process_docx(f):
    """Return the text extracted from the DOCX file *f* by docx2txt."""
    # TODO weird output with many spaces
    return docx2txt.process(f)


def process_file(f) -> str:
    """Extract text from the uploaded file *f*, dispatching on its suffix.

    *f* must expose ``filename`` and ``read()`` (and ``seek()`` for PDFs) --
    presumably an upload object such as werkzeug's ``FileStorage``; verify
    against the caller.

    The suffix comparison is case-insensitive. Files with an unrecognised
    suffix are treated as text: a bytes payload is decoded as UTF-8 with
    undecodable bytes replaced, rather than being turned into its ``b'...'``
    repr.
    """
    # TODO proper file format distinguishing, not only by suffix?
    _, ext = os.path.splitext(f.filename or "")
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)

    data = f.read()
    if isinstance(data, (bytes, bytearray)):
        return bytes(data).decode("utf-8", errors="replace")
    return str(data)


# Court designation such as "Okresní soud v ...". The "správní" part after
# "nejvyšší" is optional so that both "Nejvyšší soud" and "Nejvyšší správní
# soud" are recognised.
court_pat = re.compile(
    r"\b(?:okresní|krajský|vrchní|nejvyšší(?:\s+správní\w*)?|ústavní)\w*\s+soud\w*(?:\s+ve?)?\s+((?:\w|\s)+)",
    flags=re.IGNORECASE,
)


def _first_span(pattern, text_content: str) -> CheckResult:
    """Return the span of the first *pattern* match in the text, or False."""
    match = pattern.search(text_content)
    if match is None:
        return False
    return match.span()


def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Return the span of the first court designation, or False if absent."""
    return _first_span(court_pat, text_content)


def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Analyse the text with the Geneea G3 client; detection is a stub.

    The entities and relations returned by the analysis are printed for
    debugging. Locating the accuser is not implemented yet, so the check
    always reports ``False``, like the other unimplemented checks.

    The API key is read from the ``GENEEA_API_KEY`` environment variable,
    falling back to the key embedded in this module.
    """
    requestBuilder = g3.Request.Builder(analyses=[g3.AnalysisType.ALL])
    user_key = os.environ.get("GENEEA_API_KEY", _FALLBACK_GENEEA_KEY)

    with g3.Client.create(userKey=user_key) as analyzer:
        result = analyzer.analyze(requestBuilder.build(id=str(1), text=text_content))

        for e in result.entities:
            print(f'\t{e.type}: {e.stdForm}')

        for r in result.relations:
            print(r)

        for e in result.entities:
            if e.stdForm == "žalobce":
                pass  # TODO: derive a span for the accuser entity

    # Explicit result so the function honours its declared return type
    # instead of implicitly returning None.
    return False


# hard to implement
def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder check for the topic of the filing; always False."""
    return False


# also hard to implement
def validate_intent(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder check for the intent of the filing; always False."""
    return False


def validate_signature(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder check for a signature; always False."""
    return False


# "v/ve <place> ... [dne] <day>." -- a place followed by the start of a date.
date_and_place_pat = re.compile(
    r"\bve?\s+[^\n]+(?:\s|[.,\-–—:])+(?:dne)?(?:\s|[.,\-–—:])+\d+\.",
    flags=re.IGNORECASE,
)


def validate_date_and_place(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Return the span of the first place-and-date phrase, or False."""
    return _first_span(date_and_place_pat, text_content)


def validate(text_content: str) -> object:
    """Run every check on the text and return the results with the text.

    Returns a dict with two keys: ``"checks"`` maps each check name to the
    span of its match or ``False``, and ``"parsed_content"`` holds the
    input text unchanged.
    """
    return {
        "checks": {
            "court": validate_court(text_content),
            "date_and_place": validate_date_and_place(text_content),
            "accuser": validate_accuser(text_content),
            "topic": validate_topic(text_content),
            "intent": validate_intent(text_content),
            "signature": validate_signature(text_content),
        },
        "parsed_content": text_content,
    }


# debug
if __name__ == "__main__":
    import sys

    print(validate(sys.stdin.read()))