Add tesseract text extraction from jpg
This commit is contained in:
parent
c51386f195
commit
8d58a4e4c2
@ -1,6 +1,7 @@
|
|||||||
certifi==2020.12.5
|
certifi==2020.12.5
|
||||||
chardet==4.0.0
|
chardet==4.0.0
|
||||||
click==7.1.2
|
click==7.1.2
|
||||||
|
docx2txt==0.8
|
||||||
Flask==1.1.2
|
Flask==1.1.2
|
||||||
geneea-nlp-client==1.2.0
|
geneea-nlp-client==1.2.0
|
||||||
idna==2.10
|
idna==2.10
|
||||||
@ -10,7 +11,8 @@ joblib==1.0.0
|
|||||||
MarkupSafe==1.1.1
|
MarkupSafe==1.1.1
|
||||||
nltk==3.5
|
nltk==3.5
|
||||||
pdftotext==2.1.5
|
pdftotext==2.1.5
|
||||||
PyPDF2==1.26.0
|
Pillow==8.1.0
|
||||||
|
pytesseract==0.3.7
|
||||||
regex==2020.11.13
|
regex==2020.11.13
|
||||||
requests==2.25.1
|
requests==2.25.1
|
||||||
retrying==1.3.3
|
retrying==1.3.3
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
import pdftotext
|
import pdftotext
|
||||||
|
import docx2txt
|
||||||
|
import pytesseract
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
def process_jpg(f):
|
def process_jpg(f):
|
||||||
pass
|
text = pytesseract.image_to_string(Image.open(f), lang="ces")
|
||||||
|
return text
|
||||||
|
|
||||||
def process_pdf(f):
|
def process_pdf(f):
|
||||||
pdf = pdftotext.PDF(f)
|
pdf = pdftotext.PDF(f)
|
||||||
@ -10,7 +14,9 @@ def process_pdf(f):
|
|||||||
return aggregate
|
return aggregate
|
||||||
|
|
||||||
def process_docx(f):
|
def process_docx(f):
|
||||||
pass
|
# TODO: the extracted text contains many superfluous spaces; clean it up
|
||||||
|
return docx2txt.process(f)
|
||||||
|
|
||||||
|
|
||||||
def process_file(f):
|
def process_file(f):
|
||||||
# TODO: distinguish file formats properly, not only by suffix?
|
# TODO: distinguish file formats properly, not only by suffix?
|
||||||
@ -23,8 +29,6 @@ def process_file(f):
|
|||||||
elif ext == '.docx':
|
elif ext == '.docx':
|
||||||
return process_docx(f)
|
return process_docx(f)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def validate_court(lawsuit):
|
def validate_court(lawsuit):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user