93 lines
2.6 KiB
Python
93 lines
2.6 KiB
Python
import os
|
|
import pdftotext
|
|
import docx2txt
|
|
import pytesseract
|
|
from PIL import Image
|
|
import regex as re
|
|
from typing import Union, Tuple, Literal
|
|
from pdf2image import convert_from_bytes
|
|
|
|
# process_pdf returns the text embedded in a PDF only when that text contains
# more than this many alphanumeric characters; otherwise it OCRs the pages.
PDF_CHARACTER_THRESHOLD = 10
|
|
|
|
def process_jpg(f):
    """Run OCR over an image and return the recognised text.

    ``f`` is handed straight to ``PIL.Image.open``; the resulting image is
    passed to ``pytesseract.image_to_string`` with the ``ces`` language model.
    """
    image = Image.open(f)
    return pytesseract.image_to_string(image, lang="ces")
|
|
|
|
def process_pdf(f):
    """Extract text from a PDF, falling back to OCR when little text is found.

    The pages reported by ``pdftotext`` are joined with blank lines. If that
    text holds more than ``PDF_CHARACTER_THRESHOLD`` alphanumeric characters
    it is returned unchanged. Otherwise the file is rewound, rendered to
    images, and each image is OCR'd with the ``ces`` language model; the
    per-page results are joined with blank lines.
    """
    embedded_text = "\n\n".join(pdftotext.PDF(f))
    alnum_total = sum(1 for ch in embedded_text if ch.isalnum())
    if alnum_total > PDF_CHARACTER_THRESHOLD:
        return embedded_text

    # Not enough embedded text: re-read the raw bytes from the start and OCR
    # every rendered page instead.
    f.seek(0, 0)
    pages = convert_from_bytes(f.read())
    return "\n\n".join(
        pytesseract.image_to_string(page, lang="ces") for page in pages
    )
|
|
|
|
def process_docx(f):
    """Return whatever text ``docx2txt.process`` extracts from ``f``."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
|
|
|
|
|
|
def process_file(f) -> str:
    """Convert an uploaded file to plain text, dispatching on its suffix.

    Args:
        f: A file-like object exposing ``filename`` and ``read()``.

    Returns:
        The extracted text. ``.jpg``/``.jpeg`` files are OCR'd, ``.pdf`` and
        ``.docx`` files go through their dedicated extractors, and anything
        else is returned as its raw content.

    The suffix is compared case-insensitively, and a missing filename is
    treated as "no suffix". Raw content that arrives as bytes is decoded as
    UTF-8 (undecodable bytes are replaced) rather than passed through
    ``str()``, which would yield the ``"b'...'"`` repr instead of the text.
    """
    # TODO proper file format distinguishing, not only by suffix?
    _, ext = os.path.splitext(f.filename or "")
    ext = ext.lower()

    if ext in (".jpg", ".jpeg"):
        return process_jpg(f)
    if ext == ".pdf":
        return process_pdf(f)
    if ext == ".docx":
        return process_docx(f)

    raw = f.read()
    if isinstance(raw, (bytes, bytearray)):
        return raw.decode("utf-8", errors="replace")
    return str(raw)
|
|
|
|
# Matches a court designation followed by its seat, e.g. "Okresní soud v ...".
# Adjectives are matched by stem plus ``\w*`` so inflected forms are accepted
# ("krajského", "krajskému", "nejvyššímu"). The "správní" part after
# "nejvyšší" is optional so that both "Nejvyšší soud" and "Nejvyšší správní
# soud" are recognised. The final group captures the run of word characters
# and whitespace that follows.
court_pat = re.compile(
    r"\b(?:(?:okresní|krajsk|vrchní|ústavní)\w*|nejvyšší\w*(?:\s+správní\w*)?)"
    r"\s+soud\w*(?:\s+ve?)?\s+((?:\w|\s)+)",
    flags=re.IGNORECASE,
)


def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first court designation in ``text_content``.

    Args:
        text_content: The text to search.

    Returns:
        The ``(start, end)`` span of the first ``court_pat`` match, or
        ``False`` when the pattern does not occur.
    """
    match = court_pat.search(text_content)
    return False if match is None else match.span()
|
|
|
|
def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the accuser check; not implemented yet.

    ``text_content`` is not inspected and the function returns ``None``.
    """
    return None
|
|
|
|
# hard to implement
|
|
def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the topic check; not implemented yet.

    ``text_content`` is not inspected and the function returns ``None``.
    """
    return None
|
|
|
|
# also hard to implement
|
|
def validate_intent(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the intent check; not implemented yet.

    ``text_content`` is not inspected and the function returns ``None``.
    """
    return None
|
|
|
|
def validate_signature(text_content) -> Union[Tuple[int, int], Literal[False]]:
    """Placeholder for the signature check; not implemented yet.

    ``text_content`` is not inspected and the function returns ``None``.
    """
    return None
|
|
|
|
# "v"/"ve" as a whole word, then text on the same line, then separator
# characters around an optional "dne", ending in digits followed by a dot.
date_and_place_pat = re.compile(
    r"\bve?\s+[^\n]+(?:\s|[.,\-–—:])+(?:dne)?(?:\s|[.,\-–—:])+\d+\.",
    flags=re.IGNORECASE,
)


def validate_date_and_place(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
    """Locate the first place-and-date phrase in ``text_content``.

    Returns the ``(start, end)`` span of the first ``date_and_place_pat``
    match, or ``False`` when the pattern does not occur.
    """
    found = date_and_place_pat.search(text_content)
    if found is not None:
        return found.span()
    return False
|
|
|
|
def validate(text_content: str) -> object:
    """Run the implemented checks and bundle them with the input text.

    Returns a dict with two keys: ``"checks"`` maps each check name
    (``"court"``, ``"date_and_place"``) to that validator's result, and
    ``"parsed_content"`` holds ``text_content`` unchanged.
    """
    checks = {
        "court": validate_court(text_content),
        "date_and_place": validate_date_and_place(text_content),
    }
    return {"checks": checks, "parsed_content": text_content}
|
|
|
|
# debug
|
|
if __name__ == "__main__":
    # Debug entry point: validate whatever text arrives on standard input and
    # print the resulting report.
    import sys

    stdin_text = sys.stdin.read()
    print(validate(stdin_text))
|