Basic court regex validator
This commit is contained in:
parent
957f5efb0c
commit
d6ad798e96
@ -37,7 +37,7 @@
|
||||
};
|
||||
in
|
||||
pkgs.mkShell {
|
||||
buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask ];
|
||||
buildInputs = with pkgs.python38Packages; [ pkgs.python38 pip geneea_sdk pytesseract pdftotext pillow docx2txt flask regex ];
|
||||
shellHook = ''
|
||||
read -p 'Insert Geneea API key:' -r -s geneea_api_key
|
||||
export geneea_api_key
|
||||
|
@ -3,6 +3,7 @@ import pdftotext
|
||||
import docx2txt
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import regex as re
|
||||
|
||||
def process_jpg(f):
|
||||
text = pytesseract.image_to_string(Image.open(f), lang="ces")
|
||||
@ -16,7 +17,7 @@ def process_pdf(f):
|
||||
def process_docx(f):
    """Return the text that docx2txt extracts from the .docx file *f*."""
    # TODO weird output with many spaces
    extracted = docx2txt.process(f)
    return extracted
|
||||
|
||||
|
||||
|
||||
def process_file(f):
|
||||
# TODO proper file format distinguishing, not only by suffix?
|
||||
@ -27,10 +28,11 @@ def process_file(f):
|
||||
elif ext == '.pdf':
|
||||
return process_pdf(f)
|
||||
elif ext == '.docx':
|
||||
return process_docx(f)
|
||||
return process_docx(f)
|
||||
|
||||
def validate_court(lawsuit):
|
||||
pass
|
||||
# Pattern for a Czech court designation, e.g. "Okresní soud v Kladně".
# Capture groups (numbering kept stable for callers):
#   1 - court level, including the optional "správní" part
#   2 - the optional "správní" part on its own
#   3 - the optional preposition "v" / "ve"
#   4 - the word following the court name (presumably the seat of the court)
# "správní" is optional so that a plain "Nejvyšší soud" matches, and a suffix
# is allowed directly after "nejvyšší" so that declined forms such as
# "Nejvyššího správního soudu" match as well.
court_pat = re.compile(
    r"(okresní|krajský|vrchní|nejvyšší\S*(\s+správní\S*)?|ústavní)\S*"
    r"\s+soud\S*(\s+ve?)?\s+(\S+)",
    flags=re.IGNORECASE,
)


def validate_court(lawsuit: str):
    """Match a court designation at the very beginning of *lawsuit*.

    The pattern is anchored at the start of the string (``match``, not
    ``search``), so any leading text or whitespace prevents a match.

    Args:
        lawsuit: Plain text of the lawsuit.

    Returns:
        The match object for the court designation, or ``None`` when the
        text does not start with one.
    """
    return court_pat.match(lawsuit)
|
||||
|
||||
def validate_accuser(lawsuit):
    """Placeholder validator for *lawsuit*; not implemented, returns None."""
    return None
|
||||
@ -50,4 +52,9 @@ def validate_date():
|
||||
pass
|
||||
|
||||
def validate(text):
    """Placeholder validator for *text*; not implemented, returns None."""
    return None
|
||||
|
||||
# debug
|
||||
if __name__ == "__main__":
    import sys

    # Debug entry point: read the whole lawsuit text from standard input
    # and print the result of the court check.
    lawsuit_text = sys.stdin.read()
    print(validate_court(lawsuit_text))
|
||||
|
Loading…
Reference in New Issue
Block a user