
Develop #25

Merged
merged 48 commits
Aug 17, 2024
Commits
35ac870
Initial commit of squeaky clean text
rhnfzl Jun 15, 2024
e285bc4
updated the sct.py script with modular code
rhnfzl Jun 15, 2024
b80aeae
updated the sct.py script with pipeline method, which would ideally w…
rhnfzl Jun 15, 2024
ee9f47e
removed unnecessary direction code
rhnfzl Jun 15, 2024
a8abf5c
adding to do list
rhnfzl Jun 15, 2024
ab921e2
adding to do list
rhnfzl Jun 15, 2024
64c9851
added requirements.txt file
rhnfzl Jun 15, 2024
cabb678
added setup.py file
rhnfzl Jun 15, 2024
5b6e759
added test cases
rhnfzl Jun 15, 2024
b46b2dc
updated config file
rhnfzl Jun 15, 2024
d97a01e
merging back
rhnfzl Jun 16, 2024
f7d8cfd
Merge branch 'main' into develop
rhnfzl Jun 16, 2024
90f743c
Develop (#2) (#3)
rhnfzl Jun 16, 2024
d746606
Merge branch 'main' of https://github.com/rhnfzl/SqueakyCleanText int…
rhnfzl Jun 16, 2024
d03567f
rebase
rhnfzl Jun 16, 2024
8182b94
update the license
rhnfzl Jun 16, 2024
f4c6add
added German and Spanish support
rhnfzl Jun 16, 2024
5f8ab49
Updated file for pypi
rhnfzl Jun 16, 2024
4e65a24
Updated readme file
rhnfzl Jun 16, 2024
e2a0973
Add GitHub Actions workflow for publishing to PyPI
rhnfzl Jun 16, 2024
f1e1cc9
Updated readme file
rhnfzl Jun 16, 2024
c3ed47b
Merge branch 'main' into develop
rhnfzl Jun 16, 2024
54f2714
Updated readme file
rhnfzl Jun 16, 2024
fb90a31
added the username to the publish.yml
rhnfzl Jun 16, 2024
30587e4
update the API variable name
rhnfzl Jun 16, 2024
883d309
Merge branch 'main' into develop
rhnfzl Jun 16, 2024
1394a97
update the API user name
rhnfzl Jun 16, 2024
2b3d8fb
Bump version to 0.1.1
rhnfzl Jun 16, 2024
5bb8285
Merge branch 'main' into develop
rhnfzl Jun 16, 2024
b7f7ca5
updated the readme file
rhnfzl Jun 16, 2024
f3ef342
updated the version
rhnfzl Jun 16, 2024
2823846
Merge branch 'main' into develop
rhnfzl Jun 16, 2024
a2458d3
Update NER Process and added tag removal
rhnfzl Aug 9, 2024
0687a6f
Updated config file
rhnfzl Aug 9, 2024
4d67b00
Merge branch 'main' into develop
rhnfzl Aug 9, 2024
d2aeb02
updated the code to have the option to not output language
rhnfzl Aug 16, 2024
8d171e9
fixed the bug for NER which was referencing the wrong model variabl…
rhnfzl Aug 17, 2024
ed3ce21
Merge branch 'main' into develop
rhnfzl Aug 17, 2024
6940597
fixed the Anonymizer Engine
rhnfzl Aug 17, 2024
03ef4e0
fixed the Anonymizer Engine
rhnfzl Aug 17, 2024
fb90dcd
added the test.yml file
rhnfzl Aug 17, 2024
9d82e47
Merge branch 'main' into develop
rhnfzl Aug 17, 2024
cc59d71
added the test.yml file
rhnfzl Aug 17, 2024
eeaec6b
Merge branch 'main' into develop
rhnfzl Aug 17, 2024
e905ef1
added the test.yml file
rhnfzl Aug 17, 2024
dede5a2
added the German and Spanish language support in lingua
rhnfzl Aug 17, 2024
17cc400
Merge branch 'main' into develop
rhnfzl Aug 17, 2024
8f849f2
added the ability in the config to change the model name
rhnfzl Aug 17, 2024
29 changes: 0 additions & 29 deletions .github/workflows/test.yml

This file was deleted.

8 changes: 2 additions & 6 deletions README.md
@@ -1,6 +1,6 @@
# `SqueakyCleanText`

[![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)
[![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)

In the world of machine learning and natural language processing, clean and well-structured text data is crucial for building effective downstream models and managing token limits in language models.

@@ -16,6 +16,7 @@ SqueakyCleanText helps achieve this by addressing common text issues by doing mo
- Currency Symbols: Replaces all currency symbols with their alphabetical equivalents.
- Whitespace Normalization: Removes unnecessary whitespace.
- Detects the language of processed text if needed in downstream task.
- Supports English, Dutch, German, and Spanish languages.
- Provides text for both Language model processing and Statistical model processing.

##### Benefits for Statistical Models
@@ -118,11 +119,6 @@ Processes the input text and returns a tuple containing:
- Cleaned text with stopwords removed.
- Detected language of the text.
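The returned tuple can be unpacked into those three parts. The following is a hypothetical sketch of that return contract using a stand-in stub — the real package's class and method names may differ, and the stub only whitespace-normalizes and drops a toy stopword set:

```python
# Hypothetical stub mirroring the documented return contract:
# (cleaned text, stopword-free text, detected language).
def process_stub(text: str) -> tuple[str, str, str]:
    cleaned = " ".join(text.split())  # stand-in for the full cleaning pipeline
    no_stopwords = " ".join(
        w for w in cleaned.split() if w.lower() not in {"the", "a", "is"}
    )
    return cleaned, no_stopwords, "ENGLISH"

# Unpack the tuple the same way the real process() output would be used.
lm_text, stat_text, language = process_stub("  The   pipeline is  clean ")
print(lm_text)    # The pipeline is clean
print(stat_text)  # pipeline clean
print(language)   # ENGLISH
```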

## TODO

- Add the ability to change the NER models from the config file, which AutoModel and AutoTokenizer.
- Expand language support for stopwords to more European Languages.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request or open an issue.
8 changes: 7 additions & 1 deletion sct/config.py
@@ -47,4 +47,10 @@
REPLACE_WITH_CURRENCY_SYMBOLS = None
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
NER_CONFIDENCE_THRESHOLD = 0.85
LANGUAGE = None
LANGUAGE = None

NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
"FacebookAI/xlm-roberta-large-finetuned-conll03-spanish",
"Babelscape/wikineural-multilingual-ner"]
79 changes: 42 additions & 37 deletions sct/utils/ner.py
@@ -10,6 +10,7 @@
from presidio_anonymizer.entities import RecognizerResult

from sct.utils import constants
from sct import config
transformers.logging.set_verbosity_error()

class GeneralNER:
@@ -23,44 +24,48 @@ def __init__(self):
self.device = "cuda" if torch.cuda.is_available() else "cpu"

self.engine = AnonymizerEngine()

if len(config.NER_MODELS_LIST) == 5:
english_model_name = config.NER_MODELS_LIST[0]
dutch_model_name = config.NER_MODELS_LIST[1]
german_model_name = config.NER_MODELS_LIST[2]
spanish_model_name = config.NER_MODELS_LIST[3]
multilingual_model_name = config.NER_MODELS_LIST[4]
else:
# Load models
model_name = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
"FacebookAI/xlm-roberta-large-finetuned-conll03-spanish",
"Babelscape/wikineural-multilingual-ner"]

english_model_name = model_name[0]
dutch_model_name = model_name[1]
german_model_name = model_name[2]
spanish_model_name = model_name[3]
multilingual_model_name = model_name[4]

self.en_tokenizer = AutoTokenizer.from_pretrained(english_model_name)
self.en_model = AutoModelForTokenClassification.from_pretrained(english_model_name).to(self.device)
self.en_ner_pipeline = pipeline("ner", model=self.en_model, tokenizer=self.en_tokenizer, aggregation_strategy="simple")

self.nl_tokenizer = AutoTokenizer.from_pretrained(dutch_model_name)
self.nl_model = AutoModelForTokenClassification.from_pretrained(dutch_model_name).to(self.device)
self.nl_ner_pipeline = pipeline("ner", model=self.nl_model, tokenizer=self.nl_tokenizer, aggregation_strategy="simple")

self.de_tokenizer = AutoTokenizer.from_pretrained(german_model_name)
self.de_model = AutoModelForTokenClassification.from_pretrained(german_model_name).to(self.device)
self.de_ner_pipeline = pipeline("ner", model=self.de_model, tokenizer=self.de_tokenizer, aggregation_strategy="simple")

self.es_tokenizer = AutoTokenizer.from_pretrained(spanish_model_name)
self.es_model = AutoModelForTokenClassification.from_pretrained(spanish_model_name).to(self.device)
self.es_ner_pipeline = pipeline("ner", model=self.es_model, tokenizer=self.es_tokenizer, aggregation_strategy="simple")

self.multi_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_name)
self.multi_model = AutoModelForTokenClassification.from_pretrained(multilingual_model_name).to(self.device)
self.multi_ner_pipeline = pipeline("ner", model=self.multi_model, tokenizer=self.multi_tokenizer, aggregation_strategy="simple")

self.en_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-english")
self.en_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-english").to(self.device)
self.en_ner_pipeline = pipeline(
"ner", model=self.en_model, tokenizer=self.en_tokenizer, aggregation_strategy="simple")

self.nl_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")
self.nl_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch").to(self.device)
self.nl_ner_pipeline = pipeline(
"ner", model=self.nl_model, tokenizer=self.nl_tokenizer, aggregation_strategy="simple")

self.de_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-german")
self.de_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-german").to(self.device)
self.de_ner_pipeline = pipeline(
"ner", model=self.de_model, tokenizer=self.de_tokenizer, aggregation_strategy="simple")

self.es_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")
self.es_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish").to(self.device)
self.es_ner_pipeline = pipeline(
"ner", model=self.es_model, tokenizer=self.es_tokenizer, aggregation_strategy="simple")

self.multi_tokenizer = AutoTokenizer.from_pretrained(
"Babelscape/wikineural-multilingual-ner")
self.multi_model = AutoModelForTokenClassification.from_pretrained(
"Babelscape/wikineural-multilingual-ner").to(self.device)
self.multi_ner_pipeline = pipeline(
"ner", model=self.multi_model, tokenizer=self.multi_tokenizer, aggregation_strategy="simple")

self.min_token_length = math.ceil(min(
self.en_tokenizer.max_len_single_sentence, self.multi_tokenizer.max_len_single_sentence) * 0.9)
self.min_token_length = math.ceil(min(self.en_tokenizer.max_len_single_sentence, self.multi_tokenizer.max_len_single_sentence) * 0.9)

self.tokenizer = self.en_tokenizer if self.en_tokenizer.max_len_single_sentence <= self.multi_tokenizer.max_len_single_sentence else self.multi_tokenizer
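The `0.9` factor above leaves a roughly 10% safety margin under the shorter of the two tokenizers' single-sentence limits, so chunks cannot overflow either model. For two typical XLM-R-style tokenizers (whose `max_len_single_sentence` is 510, i.e. 512 minus two special tokens), the computation works out as:

```python
import math

# Chunk-size cap for NER: 90% of the smaller single-sentence limit,
# rounded up, so subword expansion is unlikely to overflow either model.
def min_token_length(en_limit: int, multi_limit: int) -> int:
    return math.ceil(min(en_limit, multi_limit) * 0.9)

print(min_token_length(510, 510))  # 459
```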

46 changes: 31 additions & 15 deletions setup.py
@@ -2,37 +2,49 @@

setup(
name='SqueakyCleanText',
version='0.1.8',
version='0.2.0',
author='Rehan Fazal',
description='A comprehensive text cleaning and preprocessing pipeline.',
long_description=open('README.md').read(),
long_description=open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
url='https://github.com/rhnfzl/SqueakyCleanText',
license='MIT',
packages=find_packages(),
install_requires=[
'lingua-language-detector',
'nltk',
'emoji',
'ftfy',
'Unidecode',
'beautifulsoup4',
'transformers',
'torch',
'presidio_anonymizer',
'lingua-language-detector>=2.0.0,<2.1',
'nltk>=3.8,<3.9',
'emoji>=2.8,<2.9',
'ftfy>=6.1,<6.2',
'Unidecode>=1.3,<1.4',
'beautifulsoup4>=4.12,<4.13',
'transformers>=4.30,<4.31',
'torch>=2.0,<2.1',
'presidio_anonymizer>=2.2.355,<2.3',
],
extras_require={
'dev': [
'hypothesis',
'faker',
'flake8',
'pytest',
'hypothesis==6.82.7',
'faker==20.1.0',
'flake8==6.1.0',
'pytest==7.5.0',
],
'test': [
'coverage==7.3.1',
'pytest-cov==4.1.0',
],
},
classifiers=[
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Topic :: Software Development :: Libraries',
'Topic :: Text Processing',
],
python_requires='>=3.7',
entry_points={
@@ -41,4 +53,8 @@
],
},
test_suite='tests',
keywords='text cleaning, text preprocessing, NLP, natural language processing',
package_data={
'': ['*.txt', '*.rst', '*.md'],
},
)