61 lines
1.2 KiB
Python
61 lines
1.2 KiB
Python
import os
|
|
import pdftotext
|
|
import docx2txt
|
|
import pytesseract
|
|
from PIL import Image
|
|
import regex as re
|
|
|
|
def process_jpg(f, lang="ces"):
    """Extract text from an image using Tesseract OCR.

    Args:
        f: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object).
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to ``"ces"``, the value that was previously hard-coded.

    Returns:
        The text returned by ``pytesseract.image_to_string``.
    """
    # The context manager releases the image once OCR has finished instead
    # of leaving that to garbage collection.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang=lang)
|
|
|
|
def process_pdf(f):
    """Extract the text of a PDF as a single string.

    Args:
        f: The PDF source handed straight to ``pdftotext.PDF``.

    Returns:
        The items produced by iterating the ``pdftotext.PDF`` object,
        joined with a blank line between them.
    """
    page_separator = "\n\n"
    document = pdftotext.PDF(f)
    return page_separator.join(document)
|
|
|
|
def process_docx(f):
    """Extract the text of a .docx document via ``docx2txt.process``.

    Args:
        f: The document handed straight to ``docx2txt.process``.

    Returns:
        Whatever ``docx2txt.process`` returns for ``f``.
    """
    # TODO: the output is odd and contains many spaces.
    extracted = docx2txt.process(f)
    return extracted
|
|
|
|
|
|
def process_file(f):
    """Extract text from an uploaded file, choosing the extractor by suffix.

    Args:
        f: An object with a ``filename`` attribute that the matching
            ``process_*`` helper can read from.

    Returns:
        The extracted text, or ``None`` when the filename is missing or its
        suffix is not one of ``.jpg``, ``.jpeg``, ``.pdf`` or ``.docx``.
    """
    # TODO: detect the real file format, not only the filename suffix.
    # A missing filename is treated like an unsupported one rather than
    # crashing inside os.path.splitext.
    _, ext = os.path.splitext(f.filename or "")
    # Suffixes are compared case-insensitively so "SCAN.JPG" is accepted.
    ext = ext.lower()

    if ext in (".jpg", ".jpeg"):
        return process_jpg(f)
    if ext == ".pdf":
        return process_pdf(f)
    if ext == ".docx":
        return process_docx(f)
    return None
|
|
|
|
# Matches a Czech court designation followed by the next word (usually the
# seat of the court), e.g. "Okresní soud v Kladně".
# Groups: 1 = court type (with an optional "správní" part), 2 = the
# "správní" part if present, 3 = the preposition "v"/"ve" if present,
# 4 = the word following the court name.
# The "správní" group is optional so that a plain "Nejvyšší soud" matches,
# and the stems tolerate inflected endings ("nejvyššího", "krajskému").
court_pat = re.compile(
    r"(okresní|krajsk\S*|vrchní|nejvyšší\S*(\s+správní\S*)?|ústavní)\S*"
    r"\s+soud\S*(\s+ve?)?\s+(\S+)",
    flags=re.IGNORECASE,
)


def validate_court(lawsuit: str):
    """Find the first court designation in the lawsuit text.

    Args:
        lawsuit: Plain text of the lawsuit.

    Returns:
        The match object of the first ``court_pat`` hit in ``lawsuit``,
        or ``None`` when no court designation is found.
    """
    return court_pat.search(lawsuit)
|
|
|
|
def validate_accuser(lawsuit):
    """Placeholder for accuser validation; not implemented, returns ``None``."""
    return None
|
|
|
|
# TODO: not implemented; noted by the author as hard to implement.
def validate_topic():
    """Placeholder for topic validation; not implemented, returns ``None``."""
    return None
|
|
|
|
# TODO: not implemented; noted by the author as hard to implement.
def validate_intent():
    """Placeholder for intent validation; not implemented, returns ``None``."""
    return None
|
|
|
|
def validate_signature():
    """Placeholder for signature validation; not implemented, returns ``None``."""
    return None
|
|
|
|
def validate_date():
    """Placeholder for date validation; not implemented, returns ``None``."""
    return None
|
|
|
|
def validate(text):
    """Placeholder for validating a whole text; not implemented, returns ``None``."""
    return None
|
|
|
|
# Debug entry point: read a lawsuit from stdin and print the court match.
if __name__ == "__main__":
    import sys

    stdin_text = str(sys.stdin.read())
    court_match = validate_court(stdin_text)
    print(court_match)
|