From bc59a8307234eec1041054897646cdc695f85020 Mon Sep 17 00:00:00 2001 From: Svenja Date: Wed, 16 Aug 2023 14:04:27 +0200 Subject: [PATCH 1/6] next try --- generators/__init__.py | 6 ++++ generators/numbers/annotator_split/README.md | 1 + .../numbers/annotator_split/__init__.py | 18 ++++++++++ .../annotator_split/code_snippet_common.md | 17 +++++++++ .../annotator_split/code_snippet_refinery.md | 14 ++++++++ generators/numbers/annotator_split/config.py | 36 +++++++++++++++++++ 6 files changed, 92 insertions(+) create mode 100644 generators/numbers/annotator_split/README.md create mode 100644 generators/numbers/annotator_split/__init__.py create mode 100644 generators/numbers/annotator_split/code_snippet_common.md create mode 100644 generators/numbers/annotator_split/code_snippet_refinery.md create mode 100644 generators/numbers/annotator_split/config.py diff --git a/generators/__init__.py b/generators/__init__.py index d25aae0e..b890e750 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -1,3 +1,4 @@ + from fastapi import APIRouter from .distance import ( @@ -11,6 +12,10 @@ spacy_lemmatizer, ) +from .numbers import ( + annotator_split +) + from .paths import ( url_keyword_parser, domain_parser, @@ -96,6 +101,7 @@ bert_toxicity_detector, gpt_grammar_correction, gpt_tldr_summarization, + annotator_split, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/numbers/annotator_split/README.md b/generators/numbers/annotator_split/README.md new file mode 100644 index 00000000..651edc4c --- /dev/null +++ b/generators/numbers/annotator_split/README.md @@ -0,0 +1 @@ +The module gives out a random number within a given frame. With that, you can split annotations randomly. 
\ No newline at end of file diff --git a/generators/numbers/annotator_split/__init__.py b/generators/numbers/annotator_split/__init__.py new file mode 100644 index 00000000..fffb7fff --- /dev/null +++ b/generators/numbers/annotator_split/__init__.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel +import random + +INPUT_EXAMPLE = { + "number": 4356 +} + + +class AnnotationSplitModel(BaseModel): + number: int + + class Config: + schema_example = {"example": INPUT_EXAMPLE} + +def annotator_split(request: AnnotationSplitModel): + """Generates a random number for split annotation""" + number = request.number + return random.randint(0, number-1) diff --git a/generators/numbers/annotator_split/code_snippet_common.md b/generators/numbers/annotator_split/code_snippet_common.md new file mode 100644 index 00000000..866e7090 --- /dev/null +++ b/generators/numbers/annotator_split/code_snippet_common.md @@ -0,0 +1,17 @@ +```python +import random + +def annotator_split(record) -> int: + return random.randint(0, record-1) + +# ↑ necessary bricks function +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +def example_integration(): + max_number = [3,5,1000,43694,1,14,0,13.5] + for number in max_number: + print(f"the random number in the maximal number-range of {number} is {annotator_split(number)}") + +example_integration() +``` \ No newline at end of file diff --git a/generators/numbers/annotator_split/code_snippet_refinery.md b/generators/numbers/annotator_split/code_snippet_refinery.md new file mode 100644 index 00000000..a6187dc3 --- /dev/null +++ b/generators/numbers/annotator_split/code_snippet_refinery.md @@ -0,0 +1,14 @@ + +```python +import random + +ATTRIBUTE: int + +def annotator_split(record): + try: + return random.randint(0, record[ATTRIBUTE]-1) + except: + print("Something went wrong. 
Please make sure, your desired maximal number is an Integer and bigger than 0.") + + +``` diff --git a/generators/numbers/annotator_split/config.py b/generators/numbers/annotator_split/config.py new file mode 100644 index 00000000..3eaa6083 --- /dev/null +++ b/generators/numbers/annotator_split/config.py @@ -0,0 +1,36 @@ +from util.configs import build_generator_function_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . import annotator_split, INPUT_EXAMPLE + + +def get_config(): + return build_generator_function_config( + function=annotator_split, + input_example=INPUT_EXAMPLE, + issue_id=240, + tabler_icon="Dice-3", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + kern_token_proxy_usable="false", + docker_image="none", + available_for=["refinery", "common"], + part_of_group=[ + "text_analytics", + ], # first entry should be parent directory + # bricks integrator information + integrator_inputs={ + "name": "annotator_split", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "description": "only text fields", + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + }, + }, + }, + ) From fb349982452436da4de0c2f578d9c29515143df9 Mon Sep 17 00:00:00 2001 From: Svenja Date: Mon, 28 Aug 2023 14:30:36 +0200 Subject: [PATCH 2/6] make changes --- generators/numbers/annotator_split/__init__.py | 2 +- generators/numbers/annotator_split/code_snippet_common.md | 2 +- generators/numbers/annotator_split/config.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/generators/numbers/annotator_split/__init__.py b/generators/numbers/annotator_split/__init__.py index fffb7fff..4bc2c760 100644 --- a/generators/numbers/annotator_split/__init__.py +++ b/generators/numbers/annotator_split/__init__.py @@ -2,7 +2,7 @@ import random INPUT_EXAMPLE = { - 
"number": 4356 + "number": 4 } diff --git a/generators/numbers/annotator_split/code_snippet_common.md b/generators/numbers/annotator_split/code_snippet_common.md index 866e7090..ddbeac42 100644 --- a/generators/numbers/annotator_split/code_snippet_common.md +++ b/generators/numbers/annotator_split/code_snippet_common.md @@ -9,7 +9,7 @@ def annotator_split(record) -> int: # ↓ example implementation def example_integration(): - max_number = [3,5,1000,43694,1,14,0,13.5] + max_number = [2, 4, 6, 8, 10] for number in max_number: print(f"the random number in the maximal number-range of {number} is {annotator_split(number)}") diff --git a/generators/numbers/annotator_split/config.py b/generators/numbers/annotator_split/config.py index 3eaa6083..8e125fae 100644 --- a/generators/numbers/annotator_split/config.py +++ b/generators/numbers/annotator_split/config.py @@ -16,14 +16,14 @@ def get_config(): docker_image="none", available_for=["refinery", "common"], part_of_group=[ - "text_analytics", + "numbers", ], # first entry should be parent directory # bricks integrator information integrator_inputs={ "name": "annotator_split", - "refineryDataType": RefineryDataType.TEXT.value, + "refineryDataType": RefineryDataType.INTEGER.value, "variables": { - "ATTRIBUTE": { + "N_SPLIT": { "selectionType": SelectionType.CHOICE.value, "description": "only text fields", "addInfo": [ From ef3bc478f393a1b60bebe43501f1a9811a23dce6 Mon Sep 17 00:00:00 2001 From: Svenja Date: Thu, 31 Aug 2023 18:28:49 +0200 Subject: [PATCH 3/6] email_cleaner --- generators/__init__.py | 7 ++- .../text_cleaning/email_cleaner/README.md | 1 + .../text_cleaning/email_cleaner/__init__.py | 49 +++++++++++++++++++ .../email_cleaner/code_snippet_common.md | 48 ++++++++++++++++++ .../email_cleaner/code_snippet_refinery.md | 13 +++++ .../text_cleaning/email_cleaner/config.py | 35 +++++++++++++ 6 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 generators/text_cleaning/email_cleaner/README.md create mode 
100644 generators/text_cleaning/email_cleaner/__init__.py create mode 100644 generators/text_cleaning/email_cleaner/code_snippet_common.md create mode 100644 generators/text_cleaning/email_cleaner/code_snippet_refinery.md create mode 100644 generators/text_cleaning/email_cleaner/config.py diff --git a/generators/__init__.py b/generators/__init__.py index b890e750..f776175b 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -50,7 +50,11 @@ syllable_count, ) -from .text_cleaning import html_cleanser, html_unescape +from .text_cleaning import ( + email_cleaner, + html_cleanser, + html_unescape, +) from .translation import ( deepl_translator, @@ -102,6 +106,7 @@ gpt_grammar_correction, gpt_tldr_summarization, annotator_split, + email_cleaner, ]: module_name = module.__name__.split(".")[-1] model_name = ( diff --git a/generators/text_cleaning/email_cleaner/README.md b/generators/text_cleaning/email_cleaner/README.md new file mode 100644 index 00000000..daae5e9b --- /dev/null +++ b/generators/text_cleaning/email_cleaner/README.md @@ -0,0 +1 @@ +This module removes certain parts of an email so that the focus stays on its actual content. The parts being removed are sentences starting with "EXTERNAL EMAIL", bracketed references that start with "cid:image", everything from the disclaimer up to the next mail (in case of reply chains) or the end of the text, and everything from the signature in brackets up to the next mail or the end of the text. \ No newline at end of file diff --git a/generators/text_cleaning/email_cleaner/__init__.py b/generators/text_cleaning/email_cleaner/__init__.py new file mode 100644 index 00000000..518ac8a1 --- /dev/null +++ b/generators/text_cleaning/email_cleaner/__init__.py @@ -0,0 +1,49 @@ +from pydantic import BaseModel +import re + +INPUT_EXAMPLE = { + """Hi Sofia, +I hope this email finds you well. I have some exciting news to share with you regarding a potential new client for StellarDefense Insurance. 
We have recently received an application from a company called Bleyerstift and More, who are in need of insurance coverage. Bleyerstift and More is a reputable company in the manufacturing industry. They operate in the pharmaceutical sector, specializing in the production of medical supplies. With a workforce of approximately 500 employees, they are located at 123 Main Street, Anytown, USA. You can find more information about them on their website at www.bleyerstiftandmore.com. +The client has requested a submission to be completed by April 1st, 2024. They are specifically interested in obtaining a comprehensive general liability insurance policy, with a coverage limit of $1 million for each occurrence. +Please let me know if you require any additional information from them or if there are any specific questions you would like me to address. As for attachments, there is a document that provides a detailed breakdown of Bleyerstift and More's revenue and other pertinent financial information. +I have included this attachment for your reference. I believe this opportunity has great potential for StellarDefense Insurance's growth and would appreciate your assistance in handling this case. If you have any questions or need any further information, please do not hesitate to reach out to me. Thank you for your time and support in this matter. +[cid:image012915.png@C10DB1A7.DEFECF3B] +Best regards, +Amelia Smith Insurance Broker StellarDefense Insurance + +DISCLAIMER + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by blubb. 
+ +From: Bender, Zoe +Sent: 22 September 2022 16:55 +To: Smith, Amelia +Subject: Small question + +EXTERNAL EMAIL: This email originated from outside StellarDefense. +Dear Amelia, +I just wanted to know if you have new information for me. If I remember correctly, you told me about a great deal with a new company. Love to hear more about it. +All best +Zoe +[signature]""" +} + +class EmailCleanerModel(BaseModel): + email: str + + class Config: + schema_extra = {"example": INPUT_EXAMPLE} + + +def email_cleaner(req: EmailCleanerModel): + text = req.email + text = re.sub("DISCLAIMER((\w|\s|\S))+?(?=From:|\Z)", "",text, flags=re.IGNORECASE) + text = re.sub("EXTERNAL EMAIL.*?(?=\.)\.", "", text, flags=re.IGNORECASE) + text = re.sub("\[cid:image.*?(?=\])\]", "",text, flags=re.IGNORECASE) + text = re.sub("signature((\w|\s|\S))+?(?=From:|\Z)","",text, flags=re.IGNORECASE) + return text + + + diff --git a/generators/text_cleaning/email_cleaner/code_snippet_common.md b/generators/text_cleaning/email_cleaner/code_snippet_common.md new file mode 100644 index 00000000..3c4f17ec --- /dev/null +++ b/generators/text_cleaning/email_cleaner/code_snippet_common.md @@ -0,0 +1,48 @@ +```python +import re + +def email_cleaner(text): + text = re.sub("DISCLAIMER((\w|\s|\S))+?(?=From:|\Z)", "",text, flags=re.IGNORECASE) + text = re.sub("EXTERNAL EMAIL.*?(?=\.)\.", "", text, re.IGNORECASE) + text = re.sub("\[cid:image.*?(?=\])\]", "",text, re.IGNORECASE) + text = re.sub("signature((\w|\s|\S))+?(?=From:|\Z)","",text, re.IGNORECASE) + return text + +# ↑ necessary bricks stuff +# ----------------------------------------------------------------------------------------- +# ↓ example implementation + +emails = ["""Hi Sofia, +I hope this email finds you well. I have some exciting news to share with you regarding a potential new client for StellarDefense Insurance. We have recently received an application from a company called Bleyerstift and More, who are in need of insurance coverage. 
Bleyerstift and More is a reputable company in the manufacturing industry. They operate in the pharmaceutical sector, specializing in the production of medical supplies. With a workforce of approximately 500 employees, they are located at 123 Main Street, Anytown, USA. You can find more information about them on their website at www.bleyerstiftandmore.com. +The client has requested a submission to be completed by April 1st, 2024. They are specifically interested in obtaining a comprehensive general liability insurance policy, with a coverage limit of $1 million for each occurrence. +Please let me know if you require any additional information from them or if there are any specific questions you would like me to address. As for attachments, there is a document that provides a detailed breakdown of Bleyerstift and More's revenue and other pertinent financial information. +I have included this attachment for your reference. I believe this opportunity has great potential for StellarDefense Insurance's growth and would appreciate your assistance in handling this case. If you have any questions or need any further information, please do not hesitate to reach out to me. Thank you for your time and support in this matter. +[cid:image012915.png@C10DB1A7.DEFECF3B] +Best regards, +Amelia Smith Insurance Broker StellarDefense Insurance + +DISCLAIMER + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by blubb. 
+ +From: Bender, Zoe +Sent: 22 September 2022 16:55 +To: Smith, Amelia +Subject: Small question + +EXTERNAL EMAIL: This email originated from outside StellarDefense. +Dear Amelia, +I just wanted to know if you have new information for me. If I remember correctly, you told me about a great deal with a new company. Love to hear more about it. +All best +Zoe +[signature]"""] + +def example_integration(): + texts = emails + for text in texts: + print(f"the emails will looked cleansed like this:\n{email_cleaner(text)}") +example_integration() + +``` \ No newline at end of file diff --git a/generators/text_cleaning/email_cleaner/code_snippet_refinery.md b/generators/text_cleaning/email_cleaner/code_snippet_refinery.md new file mode 100644 index 00000000..b99633bd --- /dev/null +++ b/generators/text_cleaning/email_cleaner/code_snippet_refinery.md @@ -0,0 +1,13 @@ +```python +import re + +ATTRIBUTE: str = "headline" #only text attributes + +def email_cleaner(record): + text = record[ATTRIBUTE].text + text = re.sub("DISCLAIMER((\w|\s|\S))+?(?=From:|\Z)", "",text, flags=re.IGNORECASE) + text = re.sub("EXTERNAL EMAIL.*?(?=\.)\.", "", text, re.IGNORECASE) + text = re.sub("\[cid:image.*?(?=\])\]", "",text, re.IGNORECASE) + text = re.sub("signature((\w|\s|\S))+?(?=From:|\Z)","",text, re.IGNORECASE) + return text +``` \ No newline at end of file diff --git a/generators/text_cleaning/email_cleaner/config.py b/generators/text_cleaning/email_cleaner/config.py new file mode 100644 index 00000000..ac9278d7 --- /dev/null +++ b/generators/text_cleaning/email_cleaner/config.py @@ -0,0 +1,35 @@ +from util.configs import build_generator_function_config +from util.enums import State, RefineryDataType, BricksVariableType, SelectionType +from . 
import html_cleanser, INPUT_EXAMPLE + + +def get_config(): + return build_generator_function_config( + function=html_cleanser, + input_example=INPUT_EXAMPLE, + issue_id=328, + tabler_icon="square-rounded-letter-e", + min_refinery_version="1.7.0", + state=State.PUBLIC.value, + type="python_function", + kern_token_proxy_usable="false", + docker_image="none", + available_for=["refinery", "common"], + part_of_group=[ + "text_cleaning", + ], # first entry should be parent directory + # bricks integrator information + integrator_inputs={ + "name": "email_cleaner", + "refineryDataType": RefineryDataType.TEXT.value, + "variables": { + "ATTRIBUTE": { + "selectionType": SelectionType.CHOICE.value, + "addInfo": [ + BricksVariableType.ATTRIBUTE.value, + BricksVariableType.GENERIC_STRING.value, + ], + } + }, + }, + ) From fa71ec2a5bf79c366f566d3dd6b7e12ee11c6aaf Mon Sep 17 00:00:00 2001 From: Svenja Date: Fri, 1 Sep 2023 15:54:09 +0200 Subject: [PATCH 4/6] delete annotator_split --- generators/numbers/annotator_split/README.md | 1 - .../numbers/annotator_split/__init__.py | 18 ---------- .../annotator_split/code_snippet_common.md | 17 --------- .../annotator_split/code_snippet_refinery.md | 14 -------- generators/numbers/annotator_split/config.py | 36 ------------------- 5 files changed, 86 deletions(-) delete mode 100644 generators/numbers/annotator_split/README.md delete mode 100644 generators/numbers/annotator_split/__init__.py delete mode 100644 generators/numbers/annotator_split/code_snippet_common.md delete mode 100644 generators/numbers/annotator_split/code_snippet_refinery.md delete mode 100644 generators/numbers/annotator_split/config.py diff --git a/generators/numbers/annotator_split/README.md b/generators/numbers/annotator_split/README.md deleted file mode 100644 index 651edc4c..00000000 --- a/generators/numbers/annotator_split/README.md +++ /dev/null @@ -1 +0,0 @@ -The module gives out a random number within a given frame. 
With that, you can split annotations randomly. \ No newline at end of file diff --git a/generators/numbers/annotator_split/__init__.py b/generators/numbers/annotator_split/__init__.py deleted file mode 100644 index 4bc2c760..00000000 --- a/generators/numbers/annotator_split/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from pydantic import BaseModel -import random - -INPUT_EXAMPLE = { - "number": 4 -} - - -class AnnotationSplitModel(BaseModel): - number: int - - class Config: - schema_example = {"example": INPUT_EXAMPLE} - -def annotator_split(request: AnnotationSplitModel): - """Generates a random number for split annotation""" - number = request.number - return random.randint(0, number-1) diff --git a/generators/numbers/annotator_split/code_snippet_common.md b/generators/numbers/annotator_split/code_snippet_common.md deleted file mode 100644 index ddbeac42..00000000 --- a/generators/numbers/annotator_split/code_snippet_common.md +++ /dev/null @@ -1,17 +0,0 @@ -```python -import random - -def annotator_split(record) -> int: - return random.randint(0, record-1) - -# ↑ necessary bricks function -# ----------------------------------------------------------------------------------------- -# ↓ example implementation - -def example_integration(): - max_number = [2, 4, 6, 8, 10] - for number in max_number: - print(f"the random number in the maximal number-range of {number} is {annotator_split(number)}") - -example_integration() -``` \ No newline at end of file diff --git a/generators/numbers/annotator_split/code_snippet_refinery.md b/generators/numbers/annotator_split/code_snippet_refinery.md deleted file mode 100644 index a6187dc3..00000000 --- a/generators/numbers/annotator_split/code_snippet_refinery.md +++ /dev/null @@ -1,14 +0,0 @@ - -```python -import random - -ATTRIBUTE: int - -def annotator_split(record): - try: - return random.randint(0, record[ATTRIBUTE]-1) - except: - print("Something went wrong. 
Please make sure, your desired maximal number is an Integer and bigger than 0.") - - -``` diff --git a/generators/numbers/annotator_split/config.py b/generators/numbers/annotator_split/config.py deleted file mode 100644 index 8e125fae..00000000 --- a/generators/numbers/annotator_split/config.py +++ /dev/null @@ -1,36 +0,0 @@ -from util.configs import build_generator_function_config -from util.enums import State, RefineryDataType, BricksVariableType, SelectionType -from . import annotator_split, INPUT_EXAMPLE - - -def get_config(): - return build_generator_function_config( - function=annotator_split, - input_example=INPUT_EXAMPLE, - issue_id=240, - tabler_icon="Dice-3", - min_refinery_version="1.7.0", - state=State.PUBLIC.value, - type="python_function", - kern_token_proxy_usable="false", - docker_image="none", - available_for=["refinery", "common"], - part_of_group=[ - "numbers", - ], # first entry should be parent directory - # bricks integrator information - integrator_inputs={ - "name": "annotator_split", - "refineryDataType": RefineryDataType.INTEGER.value, - "variables": { - "N_SPLIT": { - "selectionType": SelectionType.CHOICE.value, - "description": "only text fields", - "addInfo": [ - BricksVariableType.ATTRIBUTE.value, - BricksVariableType.GENERIC_STRING.value, - ], - }, - }, - }, - ) From 60e73df1d48153f2d0ab5ae98637db9ab0b1e4b4 Mon Sep 17 00:00:00 2001 From: Svenja Date: Wed, 6 Sep 2023 14:03:45 +0200 Subject: [PATCH 5/6] update config --- generators/text_cleaning/email_cleaner/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generators/text_cleaning/email_cleaner/config.py b/generators/text_cleaning/email_cleaner/config.py index ac9278d7..7d6a39a2 100644 --- a/generators/text_cleaning/email_cleaner/config.py +++ b/generators/text_cleaning/email_cleaner/config.py @@ -1,11 +1,11 @@ from util.configs import build_generator_function_config from util.enums import State, RefineryDataType, BricksVariableType, SelectionType -from . 
import html_cleanser, INPUT_EXAMPLE +from . import email_cleaner, INPUT_EXAMPLE def get_config(): return build_generator_function_config( - function=html_cleanser, + function=email_cleaner, input_example=INPUT_EXAMPLE, issue_id=328, tabler_icon="square-rounded-letter-e", From 9111283c5efe277c4acb99d7e466c1008802aa47 Mon Sep 17 00:00:00 2001 From: Svenja Date: Tue, 12 Sep 2023 18:22:22 +0200 Subject: [PATCH 6/6] rename --- generators/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/generators/__init__.py b/generators/__init__.py index f776175b..f70ce685 100644 --- a/generators/__init__.py +++ b/generators/__init__.py @@ -105,8 +105,8 @@ bert_toxicity_detector, gpt_grammar_correction, gpt_tldr_summarization, - annotator_split, email_cleaner, + annotator_split, ]: module_name = module.__name__.split(".")[-1] model_name = (