# Patch summary (validator/requirements.txt and validator/validator.py):
#   * requirements.txt: adds docx2txt==0.8, Pillow==8.1.0 and pytesseract==0.3.7;
#     removes PyPDF2==1.26.0.
#   * validator.py: adds imports of docx2txt, pytesseract and PIL.Image.
#   * process_jpg(f): the `pass` stub is replaced by
#     pytesseract.image_to_string(Image.open(f), lang="ces"), whose result is returned.
#   * process_docx(f): the `pass` stub is replaced by `return docx2txt.process(f)`;
#     the in-code TODO records that the output contains many spaces.
#   * process_file(f): the '.docx' branch's `return process_docx(f)` line is removed
#     and re-added with the same visible text, and two following blank lines are
#     removed (appears to be a whitespace-only change -- TODO confirm).
# NOTE(review): process_jpg hands Image.open(f) straight to image_to_string and never
#   closes the image explicitly; a `with Image.open(f) as img:` block would make the
#   file handle's lifetime explicit.
# NOTE(review): lang="ces" presumably requires the Czech Tesseract language data to
#   be installed on the host -- confirm against the deployment setup.
# NOTE(review): this copy of the patch appears to have lost its line breaks (the whole
#   diff sits on one line) -- TODO restore the newlines from the original commit
#   before trying to apply it.
diff --git a/validator/requirements.txt b/validator/requirements.txt index fe30b93..cb7a7d2 100644 --- a/validator/requirements.txt +++ b/validator/requirements.txt @@ -1,6 +1,7 @@ certifi==2020.12.5 chardet==4.0.0 click==7.1.2 +docx2txt==0.8 Flask==1.1.2 geneea-nlp-client==1.2.0 idna==2.10 @@ -10,7 +11,8 @@ joblib==1.0.0 MarkupSafe==1.1.1 nltk==3.5 pdftotext==2.1.5 -PyPDF2==1.26.0 +Pillow==8.1.0 +pytesseract==0.3.7 regex==2020.11.13 requests==2.25.1 retrying==1.3.3 diff --git a/validator/validator.py b/validator/validator.py index c4620c8..0047789 100644 --- a/validator/validator.py +++ b/validator/validator.py @@ -1,8 +1,12 @@ import os import pdftotext +import docx2txt +import pytesseract +from PIL import Image def process_jpg(f): - pass + text = pytesseract.image_to_string(Image.open(f), lang="ces") + return text def process_pdf(f): pdf = pdftotext.PDF(f) @@ -10,7 +14,9 @@ def process_pdf(f): return aggregate def process_docx(f): - pass + # TODO weird output with many spaces + return docx2txt.process(f) + def process_file(f): # TODO proper file format distinguishing, not only by suffix? @@ -21,9 +27,7 @@ def process_file(f): elif ext == '.pdf': return process_pdf(f) elif ext == '.docx': - return process_docx(f) - - + return process_docx(f) def validate_court(lawsuit): pass