"""Text extraction from JPG, PDF and DOCX files, plus placeholder validators."""

import logging
import os

import docx2txt
import pdftotext
import pytesseract
from PIL import Image

logger = logging.getLogger(__name__)


def process_jpg(f):
    """Run OCR on the image *f* and return the recognized text.

    *f* is passed straight to ``PIL.Image.open``. Recognition uses the
    Tesseract language code ``"ces"``.
    """
    # Close the image handle as soon as OCR is done instead of leaking it.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")


def process_pdf(f):
    """Return the text of the PDF *f*, with pages separated by a blank line.

    *f* is passed straight to ``pdftotext.PDF``; the resulting object is
    iterated and its items are joined with ``"\\n\\n"``.
    """
    pdf = pdftotext.PDF(f)
    return "\n\n".join(pdf)


def process_docx(f):
    """Return the text extracted from the DOCX document *f* by ``docx2txt``."""
    # TODO weird output with many spaces
    return docx2txt.process(f)


def process_file(f):
    """Extract text from *f*, choosing the extractor by file extension.

    *f* must expose a ``filename`` attribute; it is then handed unchanged to
    the matching ``process_*`` function. The extension is compared
    case-insensitively, so ``SCAN.JPG`` is handled like ``scan.jpg``.

    Returns the extracted text, or ``None`` when the extension is not one of
    ``.jpg``/``.jpeg``, ``.pdf`` or ``.docx``.
    """
    # TODO proper file format distinguishing, not only by suffix?
    _, ext = os.path.splitext(f.filename)
    ext = ext.lower()
    logger.debug("Detected file extension: %s", ext)

    if ext in ('.jpg', '.jpeg'):
        return process_jpg(f)
    if ext == '.pdf':
        return process_pdf(f)
    if ext == '.docx':
        return process_docx(f)
    # Unsupported extension: callers get None, as before.
    return None


def validate_court(lawsuit):
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_accuser(lawsuit):
    """Placeholder: not implemented yet, always returns ``None``."""


# hard to implement
def validate_topic():
    """Placeholder: not implemented yet, always returns ``None``."""


# also hard to implement
def validate_intent():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_signature():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_date():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate(text):
    """Placeholder: not implemented yet, always returns ``None``."""