Skip to content

Commit

Permalink
added the ability in the config to change the model name
Browse files Browse the repository at this point in the history
  • Loading branch information
rhnfzl committed Aug 17, 2024
1 parent 17cc400 commit 8f849f2
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 59 deletions.
8 changes: 2 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# `SqueakyCleanText`

[![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)
[![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)

In the world of machine learning and natural language processing, clean and well-structured text data is crucial for building effective downstream models and managing token limits in language models.

Expand All @@ -16,6 +16,7 @@ SqueakyCleanText helps achieve this by addressing common text issues by doing mo
- Currency Symbols: Replaces all currency symbols with their alphabetical equivalents.
- Whitespace Normalization: Removes unnecessary whitespace.
- Detects the language of processed text if needed in downstream task.
- Supports the English, Dutch, German and Spanish languages.
- Provides text for both Language model processing and Statistical model processing.

##### Benefits for Statistical Models
Expand Down Expand Up @@ -118,11 +119,6 @@ Processes the input text and returns a tuple containing:
- Cleaned text with stopwords removed.
- Detected language of the text.

## TODO

- Add the ability to change the NER models from the config file, loading them via AutoModel and AutoTokenizer.
- Expand language support for stopwords to more European Languages.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request or open an issue.
Expand Down
8 changes: 7 additions & 1 deletion sct/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,10 @@
# Replacement text for currency symbols; None presumably selects the
# library's default replacement behaviour -- TODO confirm against usage.
REPLACE_WITH_CURRENCY_SYMBOLS = None
# NER entity tags acted upon by the anonymisation step
# (person, location, organisation).
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
# Minimum confidence an NER prediction must reach before it is used.
NER_CONFIDENCE_THRESHOLD = 0.85
# Target language; None appears to mean "detect automatically" -- TODO confirm.
# (The line appears twice here because this is a commit-diff rendering:
# the first occurrence is the removed line, the second the added one.)
LANGUAGE = None
LANGUAGE = None

# Hugging Face model ids consumed positionally by sct/utils/ner.py, which
# only honours this list when it contains exactly 5 entries, in the order:
# [english, dutch, german, spanish, multilingual].
# NOTE(review): the Spanish entry reads "conll03-spanish", but the model id
# previously hard-coded in ner.py was "...finetuned-conll02-spanish" --
# verify this repository id actually exists on the Hugging Face Hub.
NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
                   "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
                   "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
                   "FacebookAI/xlm-roberta-large-finetuned-conll03-spanish",
                   "Babelscape/wikineural-multilingual-ner"]
79 changes: 42 additions & 37 deletions sct/utils/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from presidio_anonymizer.entities import RecognizerResult

from sct.utils import constants
from sct import config
transformers.logging.set_verbosity_error()

class GeneralNER:
Expand All @@ -23,44 +24,48 @@ def __init__(self):
self.device = "cuda" if torch.cuda.is_available() else "cpu"

self.engine = AnonymizerEngine()

if len(config.NER_MODELS_LIST) == 5:
english_model_name = config.NER_MODELS_LIST[0]
dutch_model_name = config.NER_MODELS_LIST[1]
german_model_name = config.NER_MODELS_LIST[2]
spanish_model_name = config.NER_MODELS_LIST[3]
multilingual_model_name = config.NER_MODELS_LIST[4]
else:
# Load models
model_name = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
"FacebookAI/xlm-roberta-large-finetuned-conll03-spanish",
"Babelscape/wikineural-multilingual-ner"]

english_model_name = model_name[0]
dutch_model_name = model_name[1]
german_model_name = model_name[2]
spanish_model_name = model_name[3]
multilingual_model_name = model_name[4]

self.en_tokenizer = AutoTokenizer.from_pretrained(english_model_name)
self.en_model = AutoModelForTokenClassification.from_pretrained(english_model_name).to(self.device)
self.en_ner_pipeline = pipeline("ner", model=self.en_model, tokenizer=self.en_tokenizer, aggregation_strategy="simple")

self.nl_tokenizer = AutoTokenizer.from_pretrained(dutch_model_name)
self.nl_model = AutoModelForTokenClassification.from_pretrained(dutch_model_name).to(self.device)
self.nl_ner_pipeline = pipeline("ner", model=self.nl_model, tokenizer=self.nl_tokenizer, aggregation_strategy="simple")

self.de_tokenizer = AutoTokenizer.from_pretrained(german_model_name)
self.de_model = AutoModelForTokenClassification.from_pretrained(german_model_name).to(self.device)
self.de_ner_pipeline = pipeline("ner", model=self.de_model, tokenizer=self.de_tokenizer, aggregation_strategy="simple")

self.es_tokenizer = AutoTokenizer.from_pretrained(spanish_model_name)
self.es_model = AutoModelForTokenClassification.from_pretrained(spanish_model_name).to(self.device)
self.es_ner_pipeline = pipeline("ner", model=self.es_model, tokenizer=self.es_tokenizer, aggregation_strategy="simple")

self.multi_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_name)
self.multi_model = AutoModelForTokenClassification.from_pretrained(multilingual_model_name).to(self.device)
self.multi_ner_pipeline = pipeline("ner", model=self.multi_model, tokenizer=self.multi_tokenizer, aggregation_strategy="simple")

self.en_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-english")
self.en_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-english").to(self.device)
self.en_ner_pipeline = pipeline(
"ner", model=self.en_model, tokenizer=self.en_tokenizer, aggregation_strategy="simple")

self.nl_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")
self.nl_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch").to(self.device)
self.nl_ner_pipeline = pipeline(
"ner", model=self.nl_model, tokenizer=self.nl_tokenizer, aggregation_strategy="simple")

self.de_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-german")
self.de_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll03-german").to(self.device)
self.de_ner_pipeline = pipeline(
"ner", model=self.de_model, tokenizer=self.de_tokenizer, aggregation_strategy="simple")

self.es_tokenizer = AutoTokenizer.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")
self.es_model = AutoModelForTokenClassification.from_pretrained(
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish").to(self.device)
self.es_ner_pipeline = pipeline(
"ner", model=self.es_model, tokenizer=self.es_tokenizer, aggregation_strategy="simple")

self.multi_tokenizer = AutoTokenizer.from_pretrained(
"Babelscape/wikineural-multilingual-ner")
self.multi_model = AutoModelForTokenClassification.from_pretrained(
"Babelscape/wikineural-multilingual-ner").to(self.device)
self.multi_ner_pipeline = pipeline(
"ner", model=self.multi_model, tokenizer=self.multi_tokenizer, aggregation_strategy="simple")

self.min_token_length = math.ceil(min(
self.en_tokenizer.max_len_single_sentence, self.multi_tokenizer.max_len_single_sentence) * 0.9)
self.min_token_length = math.ceil(min(self.en_tokenizer.max_len_single_sentence, self.multi_tokenizer.max_len_single_sentence) * 0.9)

self.tokenizer = self.en_tokenizer if self.en_tokenizer.max_len_single_sentence <= self.multi_tokenizer.max_len_single_sentence else self.multi_tokenizer

Expand Down
46 changes: 31 additions & 15 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,49 @@

setup(
name='SqueakyCleanText',
version='0.1.8',
version='0.2.0',
author='Rehan Fazal',
description='A comprehensive text cleaning and preprocessing pipeline.',
long_description=open('README.md').read(),
long_description=open('README.md', encoding='utf-8').read(),
long_description_content_type='text/markdown',
url='https://github.com/rhnfzl/SqueakyCleanText',
license='MIT',
packages=find_packages(),
install_requires=[
'lingua-language-detector',
'nltk',
'emoji',
'ftfy',
'Unidecode',
'beautifulsoup4',
'transformers',
'torch',
'presidio_anonymizer',
'lingua-language-detector>=2.0.0,<2.1',
'nltk>=3.8,<3.9',
'emoji>=2.8,<2.9',
'ftfy>=6.1,<6.2',
'Unidecode>=1.3,<1.4',
'beautifulsoup4>=4.12,<4.13',
'transformers>=4.30,<4.31',
'torch>=2.0,<2.1',
'presidio_anonymizer>=2.2.355,<2.3',
],
extras_require={
'dev': [
'hypothesis',
'faker',
'flake8',
'pytest',
'hypothesis==6.82.7',
'faker==20.1.0',
'flake8==6.1.0',
'pytest==7.5.0',
],
'test': [
'coverage==7.3.1',
'pytest-cov==4.1.0',
],
},
classifiers=[
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Topic :: Software Development :: Libraries',
'Topic :: Text Processing',
],
python_requires='>=3.7',
entry_points={
Expand All @@ -41,4 +53,8 @@
],
},
test_suite='tests',
keywords='text cleaning, text preprocessing, NLP, natural language processing',
package_data={
'': ['*.txt', '*.rst', '*.md'],
},
)

0 comments on commit 8f849f2

Please sign in to comment.