Added meaningful examples

rhnfzl · Nov 14, 2024 · 0ad3bc6 · 0ad3bc6
1 parent 990668d
commit 0ad3bc6
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -167,4 +167,5 @@ sct/scripts/__pycache__
 tests/.hypothesis
 SqueakyCleanText.egg-info
 test_performance.py
-OldSqueakyCleanText/
+OldSqueakyCleanText/
+snap.py
diff --git a/README.md b/README.md
@@ -77,19 +77,27 @@ pip install SqueakyCleanText
 ## Usage
 
 ### Basic Usage
+
 ```python
-from sct import sct
+from sct import sct, config
 
 # Initialize the TextCleaner
-sx = sct.TextCleaner()
+cleaner = sct.TextCleaner()
+
+# Input text
+text = "Contact John Doe at john.doe@example.com. Meeting on 2023-10-01."
 
-# Process single text
-text = "Hey John Doe, email me at john.doe@example.com"
-lm_text, stat_text, language = sx.process(text)
+# Process the text
+lm_text, stat_text, lang = cleaner.process(text)
 
-# Process multiple texts efficiently
-texts = ["Text 1", "Text 2", "Text 3"]
-results = sx.process_batch(texts, batch_size=2)
+print(f"Language Model format:    {lm_text}")
+# Output: "Contact <PERSON> at <EMAIL>. Meeting on <DATE>."
+
+print(f"Statistical Model format: {stat_text}")
+# Output: "contact meeting"
+
+print(f"Detected Language: {lang}")
+# Output: "ENGLISH"
 ```
 
 ### Advanced Configuration
@@ -113,6 +121,39 @@ config.LANGUAGE = "ENGLISH"  # Options: ENGLISH, DUTCH, GERMAN, SPANISH
 sx = sct.TextCleaner()
 ```
 
+
+### Batch Processing with Custom Configuration
+
+```python
+from sct import sct, config
+
+# Customize configuration
+config.CHECK_REMOVE_STOPWORDS = True
+config.CHECK_REMOVE_PUNCTUATION = True
+config.CHECK_NER_PROCESS = True
+config.POSITIONAL_TAGS = ['PERSON', 'ORG', 'LOC']
+config.NER_CONFIDENCE_THRESHOLD = 0.90
+
+# Initialize the TextCleaner with custom config
+cleaner = sct.TextCleaner()
+
+# Sample texts
+texts = [
+    "Email maria.garcia@example.es for more info.",  # Spanish
+    "Besuchen Sie uns im Büro in Berlin.",           # German
+    "Voor vragen, bel +31 20 123 4567.",             # Dutch
+]
+
+# Process texts in batch
+results = cleaner.process_batch(texts, batch_size=2)
+
+for lm_text, stat_text, lang in results:
+    print(f"Language: {lang}")
+    print(f"LM Format:    {lm_text}")
+    print(f"Stat Format:  {stat_text}")
+    print("-" * 40)
+```
+
 ## API
 
 ### `sct.TextCleaner`