# ---------------------------------------------------------------------------
# Patch text that shares this source line with validator/geneea.py, kept for
# reference. It is re-wrapped one patch line per comment line; the original
# indentation was lost, so the nesting shown below is not authoritative.
#
# diff --git a/flake.nix b/flake.nix
# index b345f8a..bcc5d50 100644
# --- a/flake.nix
# +++ b/flake.nix
# @@ -15,5 +15,26 @@
#  '';
#  };
# + packages.x86_64-linux.validator-dev =
# + let
# + geneea_sdk = pkgs.python38Packages.buildPythonPackage rec {
# + pname = "geneea-nlp-client";
# + version = "1.2.0";
# + src = pkgs.python38Packages.fetchPypi {
# + inherit pname version;
# + sha256 = "Q0fYD3V0NbUOItpCwA6ExIy7sIcQxpfqr1aPSXd4+cc=";
# + };
# + propagatedBuildInputs = with pkgs.python38Packages; [ requests retrying ];
# + doCheck = false;
# + };
# + in
# + pkgs.mkShell {
# + buildInputs = [ pkgs.python38 pkgs.python38Packages.pip geneea_sdk ];
# + shellHook = ''
# + read -p 'Insert Geneea API key:' -r -s geneea_api_key
# + export geneea_api_key
# + '';
# + };
# + };
#  }
#
# diff --git a/validator/.gitignore b/validator/.gitignore
# new file mode 100644
# index 0000000..06a47c5
# --- /dev/null
# +++ b/validator/.gitignore
# @@ -0,0 +1,2 @@
# +env/
# +__pycache__/
#
# diff --git a/validator/geneea.py b/validator/geneea.py
# new file mode 100755
# index 0000000..cbc61ae
# --- /dev/null
# +++ b/validator/geneea.py
# @@ -0,0 +1,31 @@
# ---------------------------------------------------------------------------
#!/usr/bin/env python
"""Manual smoke test for the Geneea NLP client.

Sends one hard-coded Czech sentence to the API and prints the relations found
in the response.
"""

import os


def getKey() -> str:
    """Return the Geneea API key taken from ``$geneea_api_key``.

    The dev shell's ``shellHook`` (see the flake.nix hunk above) fills the
    variable with ``read -s`` and exports it unconditionally, so pressing
    Enter at the prompt leaves it set but empty. An empty or whitespace-only
    value is therefore treated the same as an unset one.

    Returns:
        The key with surrounding whitespace removed.

    Raises:
        ValueError: if the variable is unset, empty or only whitespace.
    """
    key = os.getenv("geneea_api_key", "").strip()
    if not key:
        raise ValueError("$geneea_api_key env var was not set or is empty")
    return key


def doTheTest():
    """Analyze one sample sentence and print every relation in the result.

    Each relation is printed as ``<type> <textRepr> - <name>``.

    Raises:
        ValueError: propagated from :func:`getKey` when no API key is available.
    """
    # Imported here so that getKey() stays importable without the Geneea SDK.
    from geneeanlpclient import g3

    builder = g3.Request.Builder(
        analyses=[g3.AnalysisType.ALL],
        returnMentions=True,
        returnItemSentiment=True,
        domain=g3.Domain.MEDIA,
        textType="Business",
        referenceDate="2015-03-14",
        diacritization="redo",
        language="cs",
    )
    sample = builder.build(
        id="1",
        text='Koláč s rozinkami panu Karlovi moc chutnal.',
        language="cs",
    )

    with g3.Client.create(userKey=getKey()) as analyzer:
        result = analyzer.analyze(sample)

    # Debugging aids left by the author: print(result), or iterate
    # result.tokens and print each token together with its .lemma.
    for relation in result.relations:
        print(f'{relation.type} {relation.textRepr} - {relation.name}')


if __name__ == '__main__':
    doTheTest()

# ---------------------------------------------------------------------------
# Start of the next file header as it appears at the end of this source line:
#
# diff --git a/validator/requirements.txt b/validator/requirements.txt
# new file
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch text that shares this source line with validator/server.py, kept for
# reference and re-wrapped one patch line per comment line.
#
# (validator/requirements.txt header, continued)
# mode 100644
# index 0000000..cb7a7d2
# --- /dev/null
# +++ b/validator/requirements.txt
# @@ -0,0 +1,22 @@
# +certifi==2020.12.5
# +chardet==4.0.0
# +click==7.1.2
# +docx2txt==0.8
# +Flask==1.1.2
# +geneea-nlp-client==1.2.0
# +idna==2.10
# +itsdangerous==1.1.0
# +Jinja2==2.11.2
# +joblib==1.0.0
# +MarkupSafe==1.1.1
# +nltk==3.5
# +pdftotext==2.1.5
# +Pillow==8.1.0
# +pytesseract==0.3.7
# +regex==2020.11.13
# +requests==2.25.1
# +retrying==1.3.3
# +six==1.15.0
# +tqdm==4.56.0
# +urllib3==1.26.2
# +Werkzeug==1.0.1
#
# diff --git a/validator/server.py b/validator/server.py
# new file mode 100644
# index 0000000..bad18ef
# --- /dev/null
# +++ b/validator/server.py
# @@ -0,0 +1,24 @@
# ---------------------------------------------------------------------------
"""Flask front end for the validator: an upload page and a validation endpoint."""

from flask import Flask, render_template, request, jsonify
import json
import validator

app = Flask(__name__)


@app.route('/')
def index():
    """Serve the page rendered from the ``index.html`` template."""
    return render_template('index.html')


@app.route('/validator', methods=['GET', 'POST'])
def validate():
    """Extract the text of an uploaded file and return its validation result.

    POST expects a multipart upload in the ``file`` field. The file is turned
    into text by ``validator.process_file`` and the text is passed to
    ``validator.validate``; the result is returned as JSON.

    Responses:
        200: JSON-encoded validation result (POST) or a placeholder text (GET).
        400: no ``file`` part in the request, or it has an empty filename.
        415: ``validator.process_file`` returned ``None``, which is what it
            does for file extensions it does not handle.
    """
    if request.method != 'POST':
        return 'Soubor byl zvalidován. TODO musím ověřit, jak byl zvalidován.'  # TODO change

    app.logger.debug('uploaded files: %s', request.files)

    # A form submitted without choosing a file still sends the field, but with
    # an empty filename, so both cases are rejected together.
    upload = request.files.get('file')
    if upload is None or not upload.filename:
        return jsonify(error='no file was uploaded'), 400

    text = validator.process_file(upload)
    if text is None:
        return jsonify(error='unsupported file type'), 415

    return jsonify(validator.validate(text))


# Start the development server only when executed as a script, so that
# importing this module (WSGI server, tests) does not block on app.run().
if __name__ == '__main__':
    app.run()

# ---------------------------------------------------------------------------
# Patch text that follows validator/server.py on this source line:
#
# \ No newline at end of file
# diff --git a/validator/templates/index.html b/validator/templates/index.html
# new file mode 100644
# index 0000000..aeb843c
# --- /dev/null
# +++ b/validator/templates/index.html
# @@ -0,0 +1,9 @@
# +
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# Patch text that shares this source line with validator/validator.py, kept
# for reference and re-wrapped one patch line per comment line.
#
# (validator/templates/index.html, continued: only the '+' markers of the
# added lines are present in this copy, the markup itself is missing)
# +
# +
# +
# \ No newline at end of file
#
# diff --git a/validator/validator.py b/validator/validator.py
# new file mode 100644
# index 0000000..0047789
# --- /dev/null
# +++ b/validator/validator.py
# @@ -0,0 +1,53 @@
# (the patch text ends with "\ No newline at end of file")
# ---------------------------------------------------------------------------
"""Text extraction from uploaded documents plus (still empty) validation hooks.

The OCR / PDF / DOCX backends are imported inside the function that needs
them, so a missing backend only breaks its own file format.
"""

import os


def process_jpg(f):
    """Return the text recognised in image *f* by Tesseract (``lang="ces"``).

    *f* is whatever ``PIL.Image.open`` accepts.
    """
    import pytesseract
    from PIL import Image

    # Release Pillow's image as soon as the OCR pass is finished.
    with Image.open(f) as image:
        return pytesseract.image_to_string(image, lang="ces")


def process_pdf(f):
    """Return the text of PDF *f*, pages separated by a blank line."""
    import pdftotext

    return "\n\n".join(pdftotext.PDF(f))


def process_docx(f):
    """Return the text of DOCX document *f* as extracted by ``docx2txt``."""
    # TODO weird output with many spaces
    import docx2txt

    return docx2txt.process(f)


# Lower-case file suffix -> extractor.
_HANDLERS = {
    '.jpg': process_jpg,
    '.jpeg': process_jpg,
    '.pdf': process_pdf,
    '.docx': process_docx,
}


def process_file(f):
    """Extract text from upload *f*, choosing the extractor by file suffix.

    Args:
        f: object with a ``filename`` attribute that the selected extractor
            can read. The caller visible in this patch passes the entry from
            Flask's ``request.files``.

    Returns:
        The extracted text, or ``None`` when the suffix is not one of
        ``.jpg``, ``.jpeg``, ``.pdf`` or ``.docx`` (matched case-insensitively)
        or when the upload has no filename.
    """
    # TODO proper file format distinguishing, not only by suffix?
    _, ext = os.path.splitext(f.filename or '')
    handler = _HANDLERS.get(ext.lower())
    if handler is None:
        return None
    return handler(f)


def validate_court(lawsuit):
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_accuser(lawsuit):
    """Placeholder: not implemented yet, always returns ``None``."""


# hard to implement
def validate_topic():
    """Placeholder: not implemented yet, always returns ``None``."""


# also hard to implement
def validate_intent():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_signature():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate_date():
    """Placeholder: not implemented yet, always returns ``None``."""


def validate(text):
    """Placeholder for the overall validation of *text*; always returns ``None``."""