res-judicata/validator/validator.py
František Kmječ 6d29d7e191 Add to_pdf
2021-01-23 10:54:25 +01:00

112 lines
3.3 KiB
Python

import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
import regex as re
from typing import Union, Tuple, Literal
from pdf2image import convert_from_bytes
from geneeanlpclient import g3
# Number of alphanumeric characters that the text embedded in a PDF must
# exceed for process_pdf to use it as-is; at or below this count the PDF is
# rasterized and run through OCR instead.
PDF_CHARACTER_THRESHOLD = 10
def process_jpg(f):
    """Run OCR on an image and return the recognized text.

    Args:
        f: Anything ``PIL.Image.open`` accepts (a path or a binary file
            object).

    Returns:
        The string produced by Tesseract using the ``ces`` language data.
    """
    # The context manager releases the image (and any file handle Pillow
    # opened itself) as soon as OCR is done instead of waiting for GC.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")
def process_pdf(f):
    """Extract text from a PDF, falling back to OCR when it has almost none.

    The text layer is read with ``pdftotext`` and its pages are joined with
    blank lines. If that text contains more than PDF_CHARACTER_THRESHOLD
    alphanumeric characters it is returned directly. Otherwise the file is
    rewound, every page is rendered to an image and OCR'd with the ``ces``
    language data, and the per-page results are joined with blank lines.

    Args:
        f: Seekable binary file object holding the PDF.

    Returns:
        The extracted or OCR'd text.
    """
    embedded_text = "\n\n".join(pdftotext.PDF(f))
    alnum_total = sum(1 for ch in embedded_text if ch.isalnum())
    if alnum_total > PDF_CHARACTER_THRESHOLD:
        return embedded_text

    # Too little embedded text: start again from the beginning of the file
    # and OCR the rendered pages.
    f.seek(0, 0)
    pages = convert_from_bytes(f.read())
    return "\n\n".join(
        pytesseract.image_to_string(page, lang="ces") for page in pages
    )
def process_docx(f):
    """Return the text that ``docx2txt`` extracts from the document *f*."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
def process_file(f) -> str:
    """Convert an uploaded file to plain text based on its filename suffix.

    Args:
        f: File-like object exposing ``filename`` and ``read()``.

    Returns:
        Text extracted by the matching handler (JPEG -> OCR, PDF, DOCX).
        Any other suffix is treated as plain text: bytes are decoded as
        UTF-8 (undecodable bytes are replaced), strings are returned as-is.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is treated as "no suffix" rather than crashing.
    _, ext = os.path.splitext(f.filename or "")
    # Suffixes are matched case-insensitively so "SCAN.JPG" is still OCR'd.
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)

    raw = f.read()
    if isinstance(raw, (bytes, bytearray)):
        # str(b"...") would yield the "b'...'" repr, not the file's text.
        return raw.decode("utf-8", errors="replace")
    return str(raw)
# Czech court designation: an adjective naming the court type (matched by
# stem so inflected forms such as "Krajskému" or "Nejvyššího" are accepted),
# the word "soud" in any case, an optional preposition "v"/"ve", and the
# words that follow. "správní" is optional after "nejvyšší" so that both
# "Nejvyšší soud" and "Nejvyšší správní soud" match.
court_pat = re.compile(
    r"\b(?:(?:okresní|obvodní|městsk|krajsk|vrchní|ústavní)\w*"
    r"|nejvyšší\w*(?:\s+správní\w*)?)"
    r"\s+soud\w*(?:\s+ve?)?\s+([\w\s]+)",
    flags=re.IGNORECASE,
)


def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first court designation in *text_content*.

    Args:
        text_content: Text to search.

    Returns:
        The ``(start, end)`` span of the first match of ``court_pat``, or
        ``False`` when no court is mentioned.
    """
    match = court_pat.search(text_content)
    if match is None:
        return False
    return match.span()
def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Analyze the text with the Geneea ``g3`` client and dump what it finds.

    The check itself is not implemented yet: the detected entities and
    relations are only printed for debugging, and the function always
    reports ``False`` (explicitly, like the other unimplemented validators,
    instead of falling off the end and returning ``None``).

    Args:
        text_content: Text to analyze.

    Returns:
        Always ``False`` for now.
    """
    # NOTE(security): the fallback key below is committed to the repository;
    # supply GENEEA_API_KEY in the environment and rotate/remove the literal.
    user_key = (
        os.environ.get("GENEEA_API_KEY") or '4330765d043bfd5366b04a20c18b2dc0'
    )
    requestBuilder = g3.Request.Builder(analyses=[g3.AnalysisType.ALL])
    with g3.Client.create(userKey=user_key) as analyzer:
        result = analyzer.analyze(requestBuilder.build(id=str(1), text=text_content))
        # Debug output of everything the analysis returned.
        for e in result.entities:
            print(f'\t{e.type}: {e.stdForm}')
        for r in result.relations:
            print(r)
    # TODO: find the entity whose stdForm is "žalobce" and return its span.
    return False
# hard to implement
def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the "topic" check; always returns ``False`` for now."""
    not_found: Literal[False] = False
    return not_found
# also hard to implement
def validate_intent(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the "intent" check; always returns ``False`` for now."""
    not_found: Literal[False] = False
    return not_found
def validate_signature(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the "signature" check; always returns ``False`` for now."""
    not_found: Literal[False] = False
    return not_found
# "V <place> [dne] <day>." on a single line: the preposition "v"/"ve", the
# rest of the place text, at least one separator (whitespace or punctuation),
# an optional "dne" followed by its own separator, and a day number with a
# trailing dot. Only ONE separator run is required when "dne" is absent, so
# "V Praze 5. ledna 2021" is recognized as well as "V Praze dne 5. 1. 2021".
date_and_place_pat = re.compile(
    r"\bve?\s+[^\n]+[\s.,\-–—:]+(?:dne[\s.,\-–—:]+)?\d+\.",
    flags=re.IGNORECASE,
)


def validate_date_and_place(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first "place and date" line in *text_content*.

    Args:
        text_content: Text to search.

    Returns:
        The ``(start, end)`` span of the first match of
        ``date_and_place_pat``, or ``False`` when there is none.
    """
    match = date_and_place_pat.search(text_content)
    if match is None:
        return False
    return match.span()
def validate(text_content: str) -> object:
    """Run every validator over *text_content* and collect the results.

    Returns:
        A dict with two keys: ``"checks"`` maps each check name to that
        validator's return value, and ``"parsed_content"`` holds the input
        text unchanged.
    """
    # Listed in execution order; the resulting dict keeps the same key order.
    validators = (
        ("court", validate_court),
        ("date_and_place", validate_date_and_place),
        ("accuser", validate_accuser),
        ("topic", validate_topic),
        ("intent", validate_intent),
        ("signature", validate_signature),
    )
    outcome = {label: check(text_content) for label, check in validators}
    return {"checks": outcome, "parsed_content": text_content}
# debug: validate whatever text is piped in on stdin and print the report
if __name__ == "__main__":
    import sys

    stdin_text = str(sys.stdin.read())
    report = validate(stdin_text)
    print(report)