From 697165fb632f1add088e7afa0dd44a0bb798ddfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20Kmje=C4=8D?= Date: Fri, 22 Jan 2021 20:51:02 +0100 Subject: [PATCH 1/6] Create skeleton --- validator/.gitignore | 1 + validator/requirements.txt | 18 ++++++++++++++++++ validator/server.py | 10 ++++++++++ validator/validator.py | 31 +++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) create mode 100644 validator/.gitignore create mode 100644 validator/requirements.txt create mode 100644 validator/server.py create mode 100644 validator/validator.py diff --git a/validator/.gitignore b/validator/.gitignore new file mode 100644 index 0000000..bdaab25 --- /dev/null +++ b/validator/.gitignore @@ -0,0 +1 @@ +env/ diff --git a/validator/requirements.txt b/validator/requirements.txt new file mode 100644 index 0000000..f565733 --- /dev/null +++ b/validator/requirements.txt @@ -0,0 +1,18 @@ +certifi==2020.12.5 +chardet==4.0.0 +click==7.1.2 +Flask==1.1.2 +geneea-nlp-client==1.2.0 +idna==2.10 +itsdangerous==1.1.0 +Jinja2==2.11.2 +joblib==1.0.0 +MarkupSafe==1.1.1 +nltk==3.5 +regex==2020.11.13 +requests==2.25.1 +retrying==1.3.3 +six==1.15.0 +tqdm==4.56.0 +urllib3==1.26.2 +Werkzeug==1.0.1 diff --git a/validator/server.py b/validator/server.py new file mode 100644 index 0000000..770473d --- /dev/null +++ b/validator/server.py @@ -0,0 +1,10 @@ +from flask import Flask +import json + +app = Flask(__name__) + +@app.route('/') +def index(): + return json.dumps({"Hello": "World!"}) + +app.run() \ No newline at end of file diff --git a/validator/validator.py b/validator/validator.py new file mode 100644 index 0000000..8bc3cd1 --- /dev/null +++ b/validator/validator.py @@ -0,0 +1,31 @@ +def process_jpg(f): + pass + +def process_pdf(f): + pass + +def process_docx(f): + pass + +def process_file(f): + pass + +def validate_court(lawsuit): + pass + +def validate_accuser(lawsuit): + pass + +# hard to implement +def validate_topic(): + pass + +# also hard to implement +def validate_intent(): + pass + +def validate_signature(): + pass + +def validate_date(): + pass \ No newline at end of file From 3dee27732c57594532138c4fac7987862f6a21b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20Kmje=C4=8D?= Date: Fri, 22 Jan 2021 21:17:08 +0100 Subject: [PATCH 2/6] Add primitive file uploading for testing --- validator/.gitignore | 1 + validator/server.py | 18 ++++++++++++++++-- validator/templates/index.html | 9 +++++++++ validator/validator.py | 3 +++ 4 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 validator/templates/index.html diff --git a/validator/.gitignore b/validator/.gitignore index bdaab25..06a47c5 100644 --- a/validator/.gitignore +++ b/validator/.gitignore @@ -1 +1,2 @@ env/ +__pycache__/ diff --git a/validator/server.py b/validator/server.py index 770473d..46e4940 100644 --- a/validator/server.py +++ b/validator/server.py @@ -1,10 +1,24 @@ -from flask import Flask +from flask import Flask, render_template, request import json +import validator app = Flask(__name__) @app.route('/') def index(): - return json.dumps({"Hello": "World!"}) + return render_template('index.html') + +@app.route('/validator', methods=['GET', 'POST']) +def validate(): + if request.method == 'POST': + print(request.files) + f = request.files['file'] + + text = validator.process_file(f) + result = validator.validate(text) + return jsonify(result) + else: + return 'Soubor byl zvalidován. TODO musím ověřit, jak byl zvalidován.' # TODO change + app.run() \ No newline at end of file diff --git a/validator/templates/index.html b/validator/templates/index.html new file mode 100644 index 0000000..aeb843c --- /dev/null +++ b/validator/templates/index.html @@ -0,0 +1,9 @@ + + +
+ + +
+ + \ No newline at end of file diff --git a/validator/validator.py b/validator/validator.py index 8bc3cd1..f3dffe2 100644 --- a/validator/validator.py +++ b/validator/validator.py @@ -28,4 +28,7 @@ def validate_signature(): pass def validate_date(): + pass + +def validate(text): pass \ No newline at end of file From c51386f1959e68b149b96849eb2dd5f89afa36d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20Kmje=C4=8D?= Date: Fri, 22 Jan 2021 21:53:52 +0100 Subject: [PATCH 3/6] Add pdf to text --- validator/requirements.txt | 2 ++ validator/server.py | 2 +- validator/validator.py | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/validator/requirements.txt b/validator/requirements.txt index f565733..fe30b93 100644 --- a/validator/requirements.txt +++ b/validator/requirements.txt @@ -9,6 +9,8 @@ Jinja2==2.11.2 joblib==1.0.0 MarkupSafe==1.1.1 nltk==3.5 +pdftotext==2.1.5 +PyPDF2==1.26.0 regex==2020.11.13 requests==2.25.1 retrying==1.3.3 diff --git a/validator/server.py b/validator/server.py index 46e4940..bad18ef 100644 --- a/validator/server.py +++ b/validator/server.py @@ -1,4 +1,4 @@ -from flask import Flask, render_template, request +from flask import Flask, render_template, request, jsonify import json import validator diff --git a/validator/validator.py b/validator/validator.py index f3dffe2..c4620c8 100644 --- a/validator/validator.py +++ b/validator/validator.py @@ -1,14 +1,29 @@ +import os +import pdftotext + def process_jpg(f): pass def process_pdf(f): - pass + pdf = pdftotext.PDF(f) + aggregate = "\n\n".join(pdf) + return aggregate def process_docx(f): pass def process_file(f): - pass + # TODO proper file format distinguishing, not only by suffix? + _, ext = os.path.splitext(f.filename) + print(ext) + if ext == '.jpg': + return process_jpg(f) + elif ext == '.pdf': + return process_pdf(f) + elif ext == '.docx': + return process_docx(f) + + def validate_court(lawsuit): pass From 8d58a4e4c2ccab1987ede64f19fac00b86e59882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franti=C5=A1ek=20Kmje=C4=8D?= Date: Fri, 22 Jan 2021 23:00:22 +0100 Subject: [PATCH 4/6] Add tesseract text extraction from jpg --- validator/requirements.txt | 4 +++- validator/validator.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/validator/requirements.txt b/validator/requirements.txt index fe30b93..cb7a7d2 100644 --- a/validator/requirements.txt +++ b/validator/requirements.txt @@ -1,6 +1,7 @@ certifi==2020.12.5 chardet==4.0.0 click==7.1.2 +docx2txt==0.8 Flask==1.1.2 geneea-nlp-client==1.2.0 idna==2.10 @@ -10,7 +11,8 @@ joblib==1.0.0 MarkupSafe==1.1.1 nltk==3.5 pdftotext==2.1.5 -PyPDF2==1.26.0 +Pillow==8.1.0 +pytesseract==0.3.7 regex==2020.11.13 requests==2.25.1 retrying==1.3.3 diff --git a/validator/validator.py b/validator/validator.py index c4620c8..0047789 100644 --- a/validator/validator.py +++ b/validator/validator.py @@ -1,8 +1,12 @@ import os import pdftotext +import docx2txt +import pytesseract +from PIL import Image def process_jpg(f): - pass + text = pytesseract.image_to_string(Image.open(f), lang="ces") + return text def process_pdf(f): pdf = pdftotext.PDF(f) @@ -10,7 +14,9 @@ def process_pdf(f): return aggregate def process_docx(f): - pass + # TODO weird output with many spaces + return docx2txt.process(f) + def process_file(f): # TODO proper file format distinguishing, not only by suffix? @@ -21,9 +27,7 @@ def process_file(f): elif ext == '.pdf': return process_pdf(f) elif ext == '.docx': - return process_docx(f) - - + return process_docx(f) def validate_court(lawsuit): pass From b7fbff1674971aa6b6b3df58ced956173fef5e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20K=C3=A1n=C4=9B?= Date: Sat, 23 Jan 2021 00:33:55 +0100 Subject: [PATCH 5/6] Geneea demo --- flake.nix | 40 ++++++++++++++++++++++++++++++++++++++++ validator/geneea.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 flake.nix create mode 100755 validator/geneea.py diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..bcc5d50 --- /dev/null +++ b/flake.nix @@ -0,0 +1,40 @@ +{ + description = "Res judicata"; + + inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-20.09"; + + outputs = { self, nixpkgs }: + let + pkgs = nixpkgs.legacyPackages.x86_64-linux; + in + { + + packages.x86_64-linux.form-dev = pkgs.mkShell { + buildInputs = [ pkgs.nodejs-14_x ]; + shellHook = '' + ''; + }; + + packages.x86_64-linux.validator-dev = + let + geneea_sdk = pkgs.python38Packages.buildPythonPackage rec { + pname = "geneea-nlp-client"; + version = "1.2.0"; + src = pkgs.python38Packages.fetchPypi { + inherit pname version; + sha256 = "Q0fYD3V0NbUOItpCwA6ExIy7sIcQxpfqr1aPSXd4+cc="; + }; + propagatedBuildInputs = with pkgs.python38Packages; [ requests retrying ]; + doCheck = false; + }; + in + pkgs.mkShell { + buildInputs = [ pkgs.python38 pkgs.python38Packages.pip geneea_sdk ]; + shellHook = '' + read -p 'Insert Geneea API key:' -r -s geneea_api_key + export geneea_api_key + ''; + }; + + }; +} diff --git a/validator/geneea.py b/validator/geneea.py new file mode 100755 index 0000000..cbc61ae --- /dev/null +++ b/validator/geneea.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python + +import os +from geneeanlpclient import g3 + + +def getKey() -> str: + key = os.getenv("geneea_api_key") + if key is None: + raise ValueError("$geneea_api_key env var was not set") + return key + + +def doTheTest(): + builder = g3.Request.Builder(analyses=[g3.AnalysisType.ALL], returnMentions=True, returnItemSentiment=True, domain=g3.Domain.MEDIA, textType="Business", referenceDate="2015-03-14", diacritization="redo", language="cs") + with g3.Client.create(userKey=getKey()) as analyzer: + result = analyzer.analyze( + builder.build(id=str(1), text='Koláč s rozinkami panu Karlovi moc chutnal.', language="cs")) + + # print(result) + + # for t in result.tokens: + # print(f'{t} – lemma "{t.lemma}"') + + for r in result.relations: + # print(r) + print(f'{r.type} {r.textRepr} - {r.name}') + + +if __name__ == '__main__': + doTheTest() From fc42af41f814aa1fdf1566a9a4be15f93d4de7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vojt=C4=9Bch=20K=C3=A1n=C4=9B?= Date: Sat, 23 Jan 2021 00:36:47 +0100 Subject: [PATCH 6/6] fixup: Add flake.lock file --- flake.lock | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 flake.lock diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..5a9f29a --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1611179033, + "narHash": "sha256-NMepSdxJt8mFfgWHUT2o7u2yKZ8l2KD+FWbNDiR8Ufk=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "da3378c4aaf2ed350ad14552558fa55bb68d96d3", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-20.09", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +}