Develop (#32)

* Intial commit of squeaky clean text * updated the sct.py script with modular code * updated the sct.py script with pipeline method, which would ideally would help to make changes in the processing easier * removed unnecessary direction code * adding to do list * adding to do list * added requiremnt.txt file * added setup.py file * added test cases * updated config file * merging back * Develop (#2) (#3) * Intial commit of squeaky clean text * updated the sct.py script with modular code * updated the sct.py script with pipeline method, which would ideally would help to make changes in the processing easier * removed unnecessary direction code * adding to do list * adding to do list * added requiremnt.txt file * added setup.py file * added test cases * updated config file * merging back * rebase * update the license * added German and Spanish support * Updated file for pypi * Updated readme file * Add GitHub Actions workflow for publishing to PyPI * Updated readme file * Updated readme file * added the username to the publish.yml * update the API vriable name * update the API user name * Bump version to 0.1.1 * updated the readme file * updated the version * Update NER Process and added tag removal * Updated congig file * updated the code to have the option to not output language * fixed the bug for NER which was refrencing to the wrong model variable names, add the gpu support * fixed the Anonomyser Engine * fixed the Anonomyser Engine * added the test.yml file * added the test.yml file * added the test.yml file * added the German and Spanish language support in lingua * added the ability in the config to change the model name * added the ability in the config to change the model name * added the ability in the config to change the model name and fixed spanish model name * squased some bugs * added the language passing support * Refactored the code * fixed typing issue * reverted the refactor
rhnfzl · Aug 17, 2024 · 97e3bb0 · 97e3bb0
1 parent 1ce8f44
commit 97e3bb0
Show file tree

Hide file tree

Showing 10 changed files with 174 additions and 765 deletions.
diff --git a/sct/config.py b/sct/config.py
@@ -1,100 +1,57 @@
 """
-Module containing the configuration parameters for the SCT package.
+    detect_language : to detect the language automatically, but would consume more time if done on a batch
+    fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
+    to_ascii_unicode : if True, convert non-ASCII characters into their closest ASCII equivalents
+    replace_with_url : special URL token, default "<URL>",
+    replace_with_email : special EMAIL token, default "<EMAIL>",
+    replace_years : special YEAR token, default "<YEAR>",
+    replace_with_phone_number : special PHONE token, default "<PHONE>",
+    replace_with_number : special NUMBER token, default "<NUMBER>",
+    no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
+    ner_process : if True, run the NER process to remove the positional tags, e.g. PER, LOC, ORG, MISC
+    remove_isolated_letters : remove any isolated letters that do not add any value to the text
+    remove_isolated_symbols : remove any isolated symbols which shouldn't be present in the text, usually those that are
+                            not immediately preceded and followed by a letter or number
+    normalize_whitespace : remove any unnecessary whitespace
+    statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
+    casefold : to lower the text
+    remove_stopwords : remove stopwords based on the language, uses NLTK stopwords
+    remove_punctuation : removes all the special symbols
 """
 
-# Flag to detect the language automatically. If True, the language will be detected for each text.
 CHECK_DETECT_LANGUAGE = True
-
-# Flag to fix "broken" unicode such as mojibake and garbled HTML entities.
 CHECK_FIX_BAD_UNICODE = True
-
-# Flag to convert non-ASCII characters into their closest ASCII equivalents.
 CHECK_TO_ASCII_UNICODE = True
-
-# Flag to replace HTML tags with a special token.
 CHECK_REPLACE_HTML = True
-
-# Flag to replace URLs with a special token.
 CHECK_REPLACE_URLS = True
-
-# Flag to replace email addresses with a special token.
 CHECK_REPLACE_EMAILS = True
-
-# Flag to replace years with a special token.
 CHECK_REPLACE_YEARS = True
-
-# Flag to replace phone numbers with a special token.
 CHECK_REPLACE_PHONE_NUMBERS = True
-
-# Flag to replace numbers with a special token.
 CHECK_REPLACE_NUMBERS = True
-
-# Flag to replace currency symbols with their respective alphabetical equivalents.
 CHECK_REPLACE_CURRENCY_SYMBOLS = True
-
-# Flag to execute Named Entity Recognition (NER) to remove positional tags such as PER, LOC, ORG, MISC.
 CHECK_NER_PROCESS = True
-
-# Flag to remove any isolated letters which do not add any value to the text.
 CHECK_REMOVE_ISOLATED_LETTERS = True
-
-# Flag to remove any isolated symbols which should not be present in the text.
 CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-
-# Flag to remove any unnecessary whitespace.
 CHECK_NORMALIZE_WHITESPACE = True
-
-# Flag to get the statistical model text, such as for fastText, SVM, LR.
 CHECK_STATISTICAL_MODEL_PROCESSING = True
-
-# Flag to convert all characters to lowercase.
 CHECK_CASEFOLD = True
-
-# Flag to remove stopwords based on the language. Uses NLTK stopwords.
 CHECK_REMOVE_STOPWORDS = True
-
-# Flag to remove all special symbols.
 CHECK_REMOVE_PUNCTUATION = True
-
-# Flag to remove custom stopwords specific to the SCT package.
 CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
-
-# Special token to replace URLs.
 REPLACE_WITH_URL = "<URL>"
-
-# Special token to replace HTML tags.
 REPLACE_WITH_HTML = "<HTML>"
-
-# Special token to replace email addresses.
 REPLACE_WITH_EMAIL = "<EMAIL>"
-
-# Special token to replace years.
 REPLACE_WITH_YEARS = "<YEAR>"
-
-# Special token to replace phone numbers.
 REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-
-# Special token to replace numbers.
 REPLACE_WITH_NUMBERS = "<NUMBER>"
-
-# Special token to replace currency symbols. If None, symbols will be replaced with their 3-letter abbreviations.
 REPLACE_WITH_CURRENCY_SYMBOLS = None
-
-# List of positional tags to be removed by NER.
 POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-
-# Confidence threshold for NER.
 NER_CONFIDENCE_THRESHOLD = 0.85
-
-# Language to be used for NER. If None, the language will be detected automatically.
 LANGUAGE = None
 
-# List of pre-trained NER models in order of importance.
-NER_MODELS_LIST = [
-    "FacebookAI/xlm-roberta-large-finetuned-conll03-english",  # English Model
-    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",  # Dutch Model
-    "FacebookAI/xlm-roberta-large-finetuned-conll03-german",  # German Model
-    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",  # Spanish Model
-    "Babelscape/wikineural-multilingual-ner"  # Multilingual Model
-]
-
+# The order of the models is important: English, Dutch, German, Spanish, then the multilingual model
+NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+              "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+              "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
+              "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+              "Babelscape/wikineural-multilingual-ner"]