diff --git a/validator/requirements.txt b/validator/requirements.txt
index 986a06d..caba025 100644
--- a/validator/requirements.txt
+++ b/validator/requirements.txt
@@ -1,8 +1,10 @@
 certifi==2020.12.5
 chardet==4.0.0
 click==7.1.2
+docx2pdf==0.1.7
 docx2txt==0.8
 Flask==1.1.2
+Flask-Cors==3.0.10
 geneea-nlp-client==1.2.0
 idna==2.10
 itsdangerous==1.1.0
@@ -10,6 +12,7 @@ Jinja2==2.11.2
 joblib==1.0.0
 MarkupSafe==1.1.1
 nltk==3.5
+pdf2image==1.14.0
 pdftotext==2.1.5
 Pillow==8.1.0
 pytesseract==0.3.7
@@ -20,4 +23,3 @@ six==1.15.0
 tqdm==4.56.0
 urllib3==1.26.2
 Werkzeug==1.0.1
-flask-cors==3.0.10
diff --git a/validator/server.py b/validator/server.py
index 7a00487..0ffbf2d 100644
--- a/validator/server.py
+++ b/validator/server.py
@@ -1,6 +1,11 @@
-from flask import Flask, request, jsonify
+import io
+import os
+import tempfile
+
+from flask import Flask, request, jsonify, send_file
 from flask_cors import CORS
 import validator
+from docx2pdf import convert
 
 app = Flask(__name__)
 CORS(app)
@@ -14,5 +19,28 @@ def validate():
     validation_result = validator.validate(text_content)
     return jsonify(validation_result)
 
+@app.route('/to_pdf', methods=['POST'])
+def convert_to_pdf():
+    """Return the uploaded file as a PDF; .docx uploads are converted first."""
+    # A missing 'file' part makes Flask abort the request with 400.
+    raw_file = request.files['file']
+    _, ext = os.path.splitext(raw_file.filename or "")
+    ext = ext.lower()
+    if ext == ".docx":
+        # docx2pdf.convert() works on filesystem paths and returns None, so
+        # stage the upload in a temporary directory and read the result back.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            docx_path = os.path.join(tmp_dir, "input.docx")
+            pdf_path = os.path.join(tmp_dir, "input.pdf")
+            raw_file.save(docx_path)
+            convert(docx_path, pdf_path)
+            with open(pdf_path, "rb") as pdf_file:
+                pdf_bytes = pdf_file.read()
+        return send_file(io.BytesIO(pdf_bytes), mimetype="application/pdf")
+    if ext == ".pdf":
+        # send_file() cannot infer a MIME type from a bare stream.
+        return send_file(raw_file.stream, mimetype="application/pdf")
+    return "Bad file format", 400
+
 
 app.run()
diff --git a/validator/validator.py b/validator/validator.py
index e7816a9..131a8b9 100644
--- a/validator/validator.py
+++ b/validator/validator.py
@@ -6,6 +6,7 @@ from PIL import Image
 import regex as re
 from typing import Union, Tuple, Literal
 from pdf2image import convert_from_bytes
+from geneeanlpclient import g3
 
 PDF_CHARACTER_THRESHOLD = 10
 
@@ -56,7 +57,24 @@ def validate_court(text_content: str) -> Union[Tuple[int, int], Literal[False]]:
     return match.span()
 
 def validate_accuser(text_content) -> Union[Tuple[int, int], Literal[False]]:
-    return False
+    """Placeholder accuser check; currently always returns False."""
+    import os  # function-scope import keeps this change self-contained
+
+    requestBuilder = g3.Request.Builder(analyses=[g3.AnalysisType.ALL])
+
+    # Keep the API key out of source control; read it from the environment.
+    api_key = os.environ.get('GENEEA_API_KEY')
+    with g3.Client.create(userKey=api_key) as analyzer:
+        result = analyzer.analyze(requestBuilder.build(id=str(1), text=text_content))
+
+    for e in result.entities:
+        if e.stdForm == "žalobce":
+            # TODO: map the matched entity to a (start, end) span.
+            pass
+
+    # Span extraction is not implemented yet, so honour the declared
+    # return contract instead of falling through with None.
+    return False
 
 
 def validate_topic(text_content) -> Union[Tuple[int, int], Literal[False]]: