res-judicata/validator/validator.py
2021-01-23 09:21:19 +01:00

93 lines
2.6 KiB
Python

import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
import regex as re
from typing import Union, Tuple, Literal
from pdf2image import convert_from_bytes
# Number of alphanumeric characters the embedded text of a PDF must exceed for
# process_pdf to return it directly; at or below this count the PDF is OCR'd.
PDF_CHARACTER_THRESHOLD = 10
def process_jpg(f):
    """Run OCR (Tesseract language ``ces``) over an image and return its text.

    ``f`` is passed to ``Image.open`` unchanged.
    """
    image = Image.open(f)
    return pytesseract.image_to_string(image, lang="ces")
def process_pdf(f):
    """Extract text from a PDF, falling back to OCR when little text is found.

    The text produced by ``pdftotext`` is returned when it contains more than
    ``PDF_CHARACTER_THRESHOLD`` alphanumeric characters. Otherwise the file is
    re-read from the start, converted to images and each image is OCR'd
    (Tesseract language ``ces``). In both cases the pieces are joined with two
    newlines.
    """
    embedded_text = "\n\n".join(pdftotext.PDF(f))
    alnum_total = sum(1 for ch in embedded_text if ch.isalnum())
    if alnum_total > PDF_CHARACTER_THRESHOLD:
        return embedded_text

    # Rewind so that read() hands the whole file to the image converter.
    f.seek(0, 0)
    page_images = convert_from_bytes(f.read())
    return "\n\n".join(
        pytesseract.image_to_string(page, lang="ces") for page in page_images
    )
def process_docx(f):
    """Return whatever ``docx2txt.process`` extracts from ``f``."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
def process_file(f) -> str:
    """Extract plain text from an uploaded file, dispatching on its extension.

    Args:
        f: File-like object exposing ``filename`` and ``read()``. The
            format-specific helpers receive the same object.

    Returns:
        The text produced by the matching helper (``.jpg``/``.jpeg``,
        ``.pdf``, ``.docx``). Any other extension is treated as plain text:
        bytes are decoded as UTF-8 (undecodable bytes are replaced rather
        than raising), anything else is passed through ``str``.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is treated as "no extension" instead of crashing.
    _, ext = os.path.splitext(f.filename or "")
    # Extensions are matched case-insensitively so ".JPG" / ".PDF" uploads
    # reach the same handlers as their lowercase spellings.
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    elif ext == '.pdf':
        return process_pdf(f)
    elif ext == '.docx':
        return process_docx(f)

    raw = f.read()
    if isinstance(raw, (bytes, bytearray)):
        # str(bytes) would return the "b'...'" repr, not the file content.
        return raw.decode("utf-8", errors="replace")
    return str(raw)
# Matches a Czech court designation such as "Okresní soud v Kladně": a court
# type, the word "soud" with any case ending, an optional "v"/"ve", and the
# following run of word/whitespace characters (captured as group 1).
#
# * "správní" is optional after "nejvyšší", so both "Nejvyšší soud" and
#   "Nejvyšší správní soud" match; the \w* after "nejvyšší" admits declined
#   forms such as "Nejvyššímu správnímu soudu".
# * "krajsk" is a stem: unlike the other court types, the nominative "krajský"
#   is not a prefix of its declined forms ("krajského", "krajskému").
court_pat = re.compile(
    r"\b(?:okresní|krajsk|vrchní|nejvyšší\w*(?:\s+správní)?|ústavní)\w*"
    r"\s+soud\w*(?:\s+ve?)?\s+((?:\w|\s)+)",
    flags=re.IGNORECASE,
)


def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the addressed court in ``text_content``.

    Returns:
        The ``(start, end)`` span of the first match of ``court_pat``, or
        ``False`` when the text contains no match.
    """
    match = court_pat.search(text_content)
    if match is None:
        return False
    return match.span()
def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the accuser check.

    Not implemented yet (hard to implement); currently ignores
    ``text_content`` and returns ``None``.
    """
    return None
def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the topic check.

    Not implemented yet (also hard to implement); currently ignores
    ``text_content`` and returns ``None``.
    """
    return None
def validate_intent(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the intent check.

    Not implemented yet; currently ignores ``text_content`` and returns
    ``None``.
    """
    return None
def validate_signature(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the signature check.

    Not implemented yet; currently ignores ``text_content`` and returns
    ``None``.
    """
    return None
# "v"/"ve" + rest-of-line text + separators + optional "dne" + separators +
# digits followed by a dot, e.g. "V Praze dne 5.".
# NOTE(review): separators are required on both sides of the optional "dne",
# so without "dne" at least two separator characters must precede the number
# ("V Praze, 5." matches, "V Praze 5." does not) - confirm this is intended.
date_and_place_pat = re.compile(
    r"\bve?\s+[^\n]+(?:\s|[.,\-–—:])+(?:dne)?(?:\s|[.,\-–—:])+\d+\.",
    flags=re.IGNORECASE,
)


def validate_date_and_place(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate a "place and date" phrase in ``text_content``.

    Returns the ``(start, end)`` span of the first ``date_and_place_pat``
    match, or ``False`` when nothing matches.
    """
    found = date_and_place_pat.search(text_content)
    return found.span() if found is not None else False
def validate(text_content: str) -> object:
    """Run the implemented checks over ``text_content`` and bundle the results.

    Returns a dict with two keys: ``"checks"`` maps ``"court"`` and
    ``"date_and_place"`` to the result of the corresponding validator, and
    ``"parsed_content"`` holds the input text unchanged.
    """
    court_result = validate_court(text_content)
    date_and_place_result = validate_date_and_place(text_content)
    checks = {
        "court": court_result,
        "date_and_place": date_and_place_result,
    }
    return {"checks": checks, "parsed_content": text_content}
# debug: validate the text piped in on stdin and print the result dict
if __name__ == "__main__":
    import sys

    stdin_text = str(sys.stdin.read())
    print(validate(stdin_text))