diff --git a/validator/requirements.txt b/validator/requirements.txt
index f565733..fe30b93 100644
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@@ -9,6 +9,8 @@ Jinja2==2.11.2
 joblib==1.0.0
 MarkupSafe==1.1.1
 nltk==3.5
+pdftotext==2.1.5
+PyPDF2==1.26.0
 regex==2020.11.13
 requests==2.25.1
 retrying==1.3.3
diff --git a/validator/server.py b/validator/server.py
index 46e4940..bad18ef 100644
--- a/validator/server.py
+++ b/validator/server.py
@@ -1,4 +1,4 @@
-from flask import Flask, render_template, request
+from flask import Flask, render_template, request, jsonify
 import json
 
 import validator
diff --git a/validator/validator.py b/validator/validator.py
index f3dffe2..c4620c8 100644
--- a/validator/validator.py
+++ b/validator/validator.py
@@ -1,14 +1,39 @@
+import os
+import pdftotext
+
 def process_jpg(f):
     pass
 
 def process_pdf(f):
-    pass
+    """Extract the text of a PDF and return it as a single string.
+
+    f is handed unchanged to pdftotext.PDF (presumably a binary
+    file-like object -- confirm against the caller). The text of the
+    individual pages is joined with a blank line between pages.
+    """
+    pdf = pdftotext.PDF(f)
+    return "\n\n".join(pdf)
 
 def process_docx(f):
     pass
 
 def process_file(f):
-    pass
+    """Dispatch an uploaded file to the handler matching its extension.
+
+    Returns the handler's result, or None when the extension is not one
+    of .jpg, .pdf or .docx (including when f has no filename).
+    """
+    # TODO proper file format distinguishing, not only by suffix?
+    # The filename may be missing and extensions arrive in any case
+    # (e.g. "SCAN.PDF"), so normalise before matching.
+    _, ext = os.path.splitext(f.filename or "")
+    ext = ext.lower()
+    if ext == '.jpg':
+        return process_jpg(f)
+    elif ext == '.pdf':
+        return process_pdf(f)
+    elif ext == '.docx':
+        return process_docx(f)
+    return None
 
 def validate_court(lawsuit):
     pass