Add binary pdf loading
This commit is contained in:
parent
8c26a6c3cc
commit
b9982b6d0b
@ -20,4 +20,4 @@ six==1.15.0
|
|||||||
tqdm==4.56.0
|
tqdm==4.56.0
|
||||||
urllib3==1.26.2
|
urllib3==1.26.2
|
||||||
Werkzeug==1.0.1
|
Werkzeug==1.0.1
|
||||||
flask-cors=3.0.10
|
flask-cors==3.0.10
|
||||||
|
@ -5,6 +5,9 @@ import pytesseract
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
import regex as re
|
import regex as re
|
||||||
from typing import Union, Tuple, Literal
|
from typing import Union, Tuple, Literal
|
||||||
|
from pdf2image import convert_from_bytes
|
||||||
|
|
||||||
|
PDF_CHARACTER_THRESHOLD = 10
|
||||||
|
|
||||||
def process_jpg(f):
|
def process_jpg(f):
|
||||||
text = pytesseract.image_to_string(Image.open(f), lang="ces")
|
text = pytesseract.image_to_string(Image.open(f), lang="ces")
|
||||||
@ -13,7 +16,19 @@ def process_jpg(f):
|
|||||||
def process_pdf(f):
|
def process_pdf(f):
|
||||||
pdf = pdftotext.PDF(f)
|
pdf = pdftotext.PDF(f)
|
||||||
aggregate = "\n\n".join(pdf)
|
aggregate = "\n\n".join(pdf)
|
||||||
return aggregate
|
count = 0
|
||||||
|
for a in aggregate:
|
||||||
|
if a.isalnum():
|
||||||
|
count += 1
|
||||||
|
if count > PDF_CHARACTER_THRESHOLD:
|
||||||
|
return aggregate
|
||||||
|
else:
|
||||||
|
f.seek(0, 0)
|
||||||
|
images = convert_from_bytes(f.read())
|
||||||
|
text = []
|
||||||
|
for image in images:
|
||||||
|
text.append(pytesseract.image_to_string(image, lang="ces"))
|
||||||
|
return "\n\n".join(text)
|
||||||
|
|
||||||
def process_docx(f):
|
def process_docx(f):
|
||||||
# TODO weird output with many spaces
|
# TODO weird output with many spaces
|
||||||
|
Loading…
Reference in New Issue
Block a user