# res-judicata/validator/validator.py
#
# 61 lines
# 1.2 KiB
# Python

import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
import regex as re
def process_jpg(f, lang="ces"):
    """Extract text from an image using Tesseract OCR.

    Args:
        f: Image source; passed straight to ``PIL.Image.open``.
        lang: Tesseract language code handed to ``image_to_string``.
            Defaults to ``"ces"`` (Czech), the value that was previously
            hard-coded.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    # Open the image in a context manager so its resources are released as
    # soon as OCR finishes instead of waiting for garbage collection.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang=lang)
def process_pdf(f):
    """Return the text of a PDF as a single string.

    ``f`` is handed to ``pdftotext.PDF``; the items obtained by iterating
    the resulting object are concatenated with two newlines between them.
    """
    pages = pdftotext.PDF(f)
    return "\n\n".join(pages)
def process_docx(f):
    """Return the text that ``docx2txt.process`` extracts from ``f``."""
    # TODO: the extracted text comes out oddly, with many spaces.
    extracted = docx2txt.process(f)
    return extracted
def process_file(f):
    """Extract text from a file, choosing the extractor by filename suffix.

    Args:
        f: Object exposing a ``filename`` attribute; it is passed on
            unchanged to the matching ``process_*`` function.

    Returns:
        The result of the matching extractor, or ``None`` when the filename
        is missing/empty or its extension is not one of ``.jpg``, ``.jpeg``,
        ``.pdf`` or ``.docx``.
    """
    # TODO proper file format distinguishing, not only by suffix?
    # A missing filename is treated like an unsupported type rather than
    # letting os.path.splitext raise TypeError on None.
    _, ext = os.path.splitext(f.filename or "")
    # Compare case-insensitively so "SCAN.JPG" is handled like "scan.jpg".
    ext = ext.lower()
    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)
    return None
# Pattern for a court designation in Czech text: the court level, the word
# "soud", an optional preposition "v"/"ve", and the next whitespace-delimited
# token. The \S* parts absorb inflected endings ("okresního", "soudu", ...).
# Groups: 1 = court level, 2 = the " správní..." part (if present),
#         3 = the preposition (if present), 4 = the token after the court.
# "správní" is optional so that plain "Nejvyšší soud" matches as well as
# "Nejvyšší správní soud"; "krajsk\S*" and "nejvyšší\S*" let inflected forms
# such as "Krajského soudu" / "Nejvyššího správního soudu" match too.
court_pat = re.compile(
    r"(okresní|krajsk\S*|vrchní|nejvyšší\S*(\s+správní\S*)?|ústavní)"
    r"\S*\s+soud\S*(\s+ve?)?\s+(\S+)",
    flags=re.IGNORECASE,
)


def validate_court(lawsuit: str):
    """Search the lawsuit text for a court designation.

    Args:
        lawsuit: Text to search.

    Returns:
        The match object for the first occurrence of ``court_pat`` in
        ``lawsuit``, or ``None`` if the pattern does not occur.
    """
    return court_pat.search(lawsuit)
def validate_accuser(lawsuit):
    """Placeholder for the accuser check; not implemented, returns ``None``."""
    return None
# NOTE: hard to implement
def validate_topic():
    """Placeholder for the topic check; not implemented, returns ``None``."""
    return None
# NOTE: also hard to implement
def validate_intent():
    """Placeholder for the intent check; not implemented, returns ``None``."""
    return None
def validate_signature():
    """Placeholder for the signature check; not implemented, returns ``None``."""
    return None
def validate_date():
    """Placeholder for the date check; not implemented, returns ``None``."""
    return None
def validate(text):
    """Placeholder for the overall validation; not implemented, returns ``None``."""
    return None
# Debug entry point: run the court check on text read from stdin and print
# the resulting match object (or None).
if __name__ == "__main__":
    import sys

    stdin_text = sys.stdin.read()
    court_match = validate_court(stdin_text)
    print(court_match)