Add tesseract text extraction from jpg
This commit is contained in:
parent
c51386f195
commit
8d58a4e4c2
@ -1,6 +1,7 @@
|
||||
certifi==2020.12.5
|
||||
chardet==4.0.0
|
||||
click==7.1.2
|
||||
docx2txt==0.8
|
||||
Flask==1.1.2
|
||||
geneea-nlp-client==1.2.0
|
||||
idna==2.10
|
||||
@ -10,7 +11,8 @@ joblib==1.0.0
|
||||
MarkupSafe==1.1.1
|
||||
nltk==3.5
|
||||
pdftotext==2.1.5
|
||||
PyPDF2==1.26.0
|
||||
Pillow==8.1.0
|
||||
pytesseract==0.3.7
|
||||
regex==2020.11.13
|
||||
requests==2.25.1
|
||||
retrying==1.3.3
|
||||
|
@ -1,8 +1,12 @@
|
||||
import os
|
||||
import pdftotext
|
||||
import docx2txt
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
def process_jpg(f):
|
||||
pass
|
||||
text = pytesseract.image_to_string(Image.open(f), lang="ces")
|
||||
return text
|
||||
|
||||
def process_pdf(f):
|
||||
pdf = pdftotext.PDF(f)
|
||||
@ -10,7 +14,9 @@ def process_pdf(f):
|
||||
return aggregate
|
||||
|
||||
def process_docx(f):
|
||||
pass
|
||||
# TODO: the extracted text comes out oddly, with many extra spaces
|
||||
return docx2txt.process(f)
|
||||
|
||||
|
||||
def process_file(f):
|
||||
# TODO: detect the file format properly, rather than relying only on the suffix?
|
||||
@ -21,9 +27,7 @@ def process_file(f):
|
||||
elif ext == '.pdf':
|
||||
return process_pdf(f)
|
||||
elif ext == '.docx':
|
||||
return process_docx(f)
|
||||
|
||||
|
||||
return process_docx(f)
|
||||
|
||||
def validate_court(lawsuit):
|
||||
pass
|
||||
|
Loading…
Reference in New Issue
Block a user