Add Tesseract text extraction from JPG and docx2txt text extraction from DOCX

2021-01-22 23:00:22 +01:00 · 2021-01-22 23:00:22 +01:00 · 8d58a4e4c2
commit 8d58a4e4c2
parent c51386f195
2 changed files with 12 additions and 6 deletions
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@ -1,6 +1,7 @@
 certifi==2020.12.5
 chardet==4.0.0
 click==7.1.2
+docx2txt==0.8
 Flask==1.1.2
 geneea-nlp-client==1.2.0
 idna==2.10
@ -10,7 +11,8 @@ joblib==1.0.0
 MarkupSafe==1.1.1
 nltk==3.5
 pdftotext==2.1.5
-PyPDF2==1.26.0
+Pillow==8.1.0
+pytesseract==0.3.7
 regex==2020.11.13
 requests==2.25.1
 retrying==1.3.3
--- a/validator/validator.py
+++ b/validator/validator.py
@ -1,8 +1,12 @@
 import os
 import pdftotext
+import docx2txt
+import pytesseract
+from PIL import Image

 def process_jpg(f):
-    pass
+    text = pytesseract.image_to_string(Image.open(f), lang="ces")
+    return text

 def process_pdf(f):
    pdf = pdftotext.PDF(f)
@ -10,7 +14,9 @@ def process_pdf(f):
    return aggregate

 def process_docx(f):
-    pass
+    # TODO weird output with many spaces
+    return docx2txt.process(f)
+    

 def process_file(f):
    # TODO proper file format distinguishing, not only by suffix?
@ -21,9 +27,7 @@ def process_file(f):
    elif ext == '.pdf':
        return process_pdf(f)
    elif ext == '.docx':
-        return process_docx(f)
-
-    
+        return process_docx(f)    

 def validate_court(lawsuit):
    pass