Add tesseract text extraction from jpg

This commit is contained in:
František Kmječ 2021-01-22 23:00:22 +01:00
parent c51386f195
commit 8d58a4e4c2
2 changed files with 12 additions and 6 deletions

View File

@ -1,6 +1,7 @@
certifi==2020.12.5
chardet==4.0.0
click==7.1.2
docx2txt==0.8
Flask==1.1.2
geneea-nlp-client==1.2.0
idna==2.10
@ -10,7 +11,8 @@ joblib==1.0.0
MarkupSafe==1.1.1
nltk==3.5
pdftotext==2.1.5
PyPDF2==1.26.0
Pillow==8.1.0
pytesseract==0.3.7
regex==2020.11.13
requests==2.25.1
retrying==1.3.3

View File

@ -1,8 +1,12 @@
import os
import pdftotext
import docx2txt
import pytesseract
from PIL import Image
def process_jpg(f, lang="ces"):
    """Extract text from an image using Tesseract OCR.

    Args:
        f: Whatever ``PIL.Image.open`` accepts for the image to read
            (passed to it unchanged).
        lang: Tesseract language code forwarded to
            ``pytesseract.image_to_string``. Defaults to ``"ces"``
            (Czech), which is the value this function always used.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until GC.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang=lang)
def process_pdf(f):
pdf = pdftotext.PDF(f)
@ -10,7 +14,9 @@ def process_pdf(f):
return aggregate
def process_docx(f):
    """Extract the text of a .docx document via ``docx2txt``.

    Args:
        f: The document to read; handed to ``docx2txt.process`` unchanged.

    Returns:
        Whatever ``docx2txt.process`` returns for the document.
    """
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
def process_file(f):
# TODO proper file format distinguishing, not only by suffix?
@ -21,9 +27,7 @@ def process_file(f):
elif ext == '.pdf':
return process_pdf(f)
elif ext == '.docx':
return process_docx(f)
return process_docx(f)
def validate_court(lawsuit):
    """Placeholder for court validation of a lawsuit; not implemented yet.

    Args:
        lawsuit: Currently ignored.

    Returns:
        Always ``None`` -- the function has no logic yet.
    """
    return None