Skip to content

Commit

Permalink
Merge pull request #24 from elimu-ai/21-perform-exploratory-data-analysis-eda
Browse files Browse the repository at this point in the history

Visualize chapter count, paragraph count, word count per reading level
  • Loading branch information
jo-elimu authored Sep 12, 2024
2 parents 4c90ed4 + 6ea7a38 commit 4ac0027
Show file tree
Hide file tree
Showing 17 changed files with 166 additions and 60 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pmml-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- name: Git Commit
run: |
git add step1_prepare/*.csv
git add step1_prepare/*.png
git add step2_train/*.pmml
git add step3_predict/*.csv
git add step3_predict/*.txt
Expand Down
1 change: 1 addition & 0 deletions pmml/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
matplotlib==3.8.2
numpy==1.26.4
pandas==2.2.2
pypmml==0.9.17
Expand Down
1 change: 1 addition & 0 deletions pmml/run_all_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import step1_prepare.step1_1_download_data
import step1_prepare.step1_2_preprocess_data
import step1_prepare.step1_3_split_data
import step1_prepare.step1_4_eda

print('\n*** Step 2. Train Model ***')
os.chdir('../step2_train')
Expand Down
25 changes: 25 additions & 0 deletions pmml/step1_prepare/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Data Visualizations

## `chapter_count`

![](./step1_4_chapter_count_scatter.png)

![](./step1_4_chapter_count_hist.png)

## `paragraph_count`

![](./step1_4_paragraph_count_scatter.png)

![](./step1_4_paragraph_count_hist.png)

## `word_count`

![](./step1_4_word_count_scatter.png)

![](./step1_4_word_count_hist.png)

## `avg_word_length`

![](./step1_4_avg_word_length_scatter.png)

![](./step1_4_avg_word_length_hist.png)
6 changes: 5 additions & 1 deletion pmml/step1_prepare/step1_2_preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
storybooks_dataframe['chapter_count'] = 0
storybooks_dataframe['paragraph_count'] = 0
storybooks_dataframe['word_count'] = 0
storybooks_dataframe['avg_word_length'] = 0
for index in storybooks_dataframe.index:
print(basename(__file__), f'index: {index}')
chapters = storybooks_dataframe.loc[index]['chapters']
Expand All @@ -29,10 +30,13 @@

word_count = chapters_utils.get_word_count(chapters_json)
storybooks_dataframe.loc[index, 'word_count'] = word_count

avg_word_length = chapters_utils.get_avg_word_length(chapters_json)
storybooks_dataframe.loc[index, 'avg_word_length'] = avg_word_length
print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}')

# Drop unnecessary columns
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count']]
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count', 'avg_word_length']]
print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}')

# Drop missing values
Expand Down
118 changes: 59 additions & 59 deletions pmml/step1_prepare/step1_2_storybooks.csv
Original file line number Diff line number Diff line change
@@ -1,59 +1,59 @@
id,reading_level,chapter_count,paragraph_count,word_count
37,1,11,20,26
1,1,7,17,167
36,1,11,29,138
59,2,16,29,129
25,3,19,54,1596
29,1,11,13,110
63,1,15,13,101
2,1,7,11,123
15,3,23,47,1291
6,1,7,7,78
53,2,15,62,486
66,2,15,44,442
56,2,11,37,398
4,1,12,25,117
42,1,15,37,264
61,2,17,29,178
50,2,18,48,811
32,1,10,25,209
11,1,12,31,194
26,2,7,11,393
51,2,19,48,623
7,1,11,13,144
64,2,4,25,85
55,2,18,52,461
34,1,11,19,148
47,1,16,18,117
24,3,14,20,731
27,2,11,23,211
46,1,5,5,81
65,2,12,30,426
60,2,11,42,427
17,3,9,10,374
19,3,15,39,1342
39,1,13,15,177
5,1,11,21,116
28,1,10,17,95
20,3,13,22,672
30,1,13,28,201
22,4,23,75,2057
57,2,16,22,133
33,1,14,13,23
8,1,11,12,150
9,1,11,11,107
16,1,11,13,120
41,4,10,30,555
10,1,6,18,101
58,2,11,16,131
49,1,9,23,258
52,2,20,41,559
18,3,17,38,564
40,1,4,16,89
14,2,14,40,488
23,4,16,55,1507
38,1,14,20,140
54,2,11,28,211
21,3,12,26,965
31,1,4,4,20
48,1,8,15,176
id,reading_level,chapter_count,paragraph_count,word_count,avg_word_length
37,1,11,20,26,3.5
1,1,7,17,167,3.784431137724551
36,1,11,29,138,4.115942028985507
59,2,16,29,129,4.10077519379845
25,3,19,54,1596,3.7042606516290726
29,1,11,13,110,3.536363636363636
63,1,15,13,101,3.742574257425743
2,1,7,11,123,3.8292682926829267
15,3,23,47,1291,3.5716498838109993
6,1,7,7,78,3.5384615384615383
53,2,15,62,486,3.707818930041152
66,2,15,44,442,4.009049773755656
56,2,11,37,398,3.6733668341708543
4,1,12,25,117,3.2051282051282053
42,1,15,37,264,3.7007575757575757
61,2,17,29,178,3.741573033707865
50,2,18,48,811,3.6757090012330456
32,1,10,25,209,3.7416267942583734
11,1,12,31,194,3.2989690721649483
26,2,7,11,393,3.6259541984732824
51,2,19,48,623,3.874799357945425
7,1,11,13,144,3.5555555555555554
64,2,4,25,85,3.847058823529412
55,2,18,52,461,3.6594360086767894
34,1,11,19,148,3.7635135135135136
47,1,16,18,117,3.4358974358974357
24,3,14,20,731,3.753761969904241
27,2,11,23,211,3.853080568720379
46,1,5,5,81,3.691358024691358
65,2,12,30,426,3.5985915492957745
60,2,11,42,427,3.7423887587822016
17,3,9,10,374,3.72192513368984
19,3,15,39,1342,3.4865871833084947
39,1,13,15,177,3.6271186440677967
5,1,11,21,116,4.646551724137931
28,1,10,17,95,4.578947368421052
20,3,13,22,672,3.6845238095238093
30,1,13,28,201,3.5970149253731343
22,4,23,75,2057,3.631988332523092
57,2,16,22,133,3.56390977443609
33,1,14,13,23,6.695652173913044
8,1,11,12,150,3.2933333333333334
9,1,11,11,107,2.9158878504672896
16,1,11,13,120,3.6416666666666666
41,4,10,30,555,3.7495495495495494
10,1,6,18,101,3.1881188118811883
58,2,11,16,131,4.175572519083969
49,1,9,23,258,3.689922480620155
52,2,20,41,559,3.695885509838998
18,3,17,38,564,3.75177304964539
40,1,4,16,89,3.831460674157303
14,2,14,40,488,3.8155737704918034
23,4,16,55,1507,3.727272727272727
38,1,14,20,140,3.8642857142857143
54,2,11,28,211,3.6587677725118484
21,3,12,26,965,3.5761658031088084
31,1,4,4,20,3.85
48,1,8,15,176,3.727272727272727
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
32 changes: 32 additions & 0 deletions pmml/step1_prepare/step1_4_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from os.path import basename
from matplotlib import pyplot
import pandas

# Exploratory data analysis (EDA): for every feature column in the
# preprocessed storybook data, save a scatter plot and a per-reading-level
# histogram as PNG files. NOTE: this module runs at import time on purpose —
# `run_all_steps.py` triggers it with a plain `import`.

# Read the preprocessed storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_2_storybooks.csv'
print(basename(__file__), f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# Skip the first two columns (`id`, `reading_level`); plot every feature column
for label in storybooks_dataframe.columns[2:]:
    # Consistency fix: prefix the log line with the script name, like every
    # other log statement in this project
    print(basename(__file__), f'label: {label}')

    # Scatter plot: feature value vs. reading level
    pyplot.scatter(storybooks_dataframe['reading_level'], storybooks_dataframe[label], alpha=0.5)
    pyplot.xlabel('reading_level')
    pyplot.ylabel(label)
    pyplot.savefig(f'step1_4_{label}_scatter.png')
    pyplot.clf()  # reset the figure so plots don't accumulate across iterations

    # Histogram: overlay one distribution per reading level (alpha keeps
    # overlapping bars visible)
    level_1 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 1][label]
    level_2 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 2][label]
    level_3 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 3][label]
    level_4 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 4][label]
    pyplot.hist(level_1, label='LEVEL1', alpha=0.5)
    pyplot.hist(level_2, label='LEVEL2', alpha=0.5)
    pyplot.hist(level_3, label='LEVEL3', alpha=0.5)
    pyplot.hist(level_4, label='LEVEL4', alpha=0.5)
    pyplot.xlabel(label)
    pyplot.legend()
    pyplot.savefig(f'step1_4_{label}_hist.png')
    pyplot.clf()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pmml/step1_prepare/step1_4_word_count_hist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pmml/step1_prepare/step1_4_word_count_scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions pmml/utils/chapters_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from os.path import basename
import string

'''json
[
Expand Down Expand Up @@ -73,3 +74,26 @@ def get_word_count(chapters_json):
print(basename(__file__), f'words: {words}')
word_count += len(words)
return word_count

def get_avg_word_length(chapters_json):
    """Return the average length of the words across all chapters.

    Punctuation is stripped from each word (via `str.translate`) before its
    length is measured, so "word." and "word" both count as length 4.
    Returns 0 when the chapters contain no words at all.
    """
    # Bug fix: this log line previously said 'get_word_count' (copy-paste
    # from the sibling function above)
    print(basename(__file__), 'get_avg_word_length')
    word_count = 0
    total_word_length = 0
    # Hoisted out of the loops: the translation table is loop-invariant
    punctuation_table = str.maketrans('', '', string.punctuation)
    for chapter in chapters_json:
        print(basename(__file__), f'chapter["sortOrder"]: {chapter["sortOrder"]}')
        for paragraph in chapter["storyBookParagraphs"]:
            print(basename(__file__), f'paragraph: {paragraph}')
            words = paragraph["originalText"].split()
            print(basename(__file__), f'words: {words}')
            word_count += len(words)
            for word in words:
                print(basename(__file__), f'word: {word}')
                word_cleaned = word.translate(punctuation_table)
                print(basename(__file__), f'word_cleaned: {word_cleaned}')
                total_word_length += len(word_cleaned)
    print(basename(__file__), f'word_count: {word_count}')
    print(basename(__file__), f'total_word_length: {total_word_length}')
    # Guard against division by zero for books without any words
    if word_count == 0:
        return 0
    return total_word_length / word_count
18 changes: 18 additions & 0 deletions pmml/utils/test_chapters_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,21 @@ def test_get_word_count_when_multiple_paragraphs():
paragraph_count = chapters_utils.get_word_count(chapters_json)

assert paragraph_count == 232

def test_get_avg_word_length_when_no_paragraphs():
    """A book whose chapters contain no paragraphs has an average word length of 0."""
    # 27 chapters, each with an empty `storyBookParagraphs` list
    chapters_json = [{"image":{"id":467},"sortOrder":0,"id":120,"storyBookParagraphs":[]},{"image":{"id":468},"sortOrder":1,"id":121,"storyBookParagraphs":[]},{"image":{"id":469},"sortOrder":2,"id":122,"storyBookParagraphs":[]},{"image":{"id":470},"sortOrder":3,"id":123,"storyBookParagraphs":[]},{"image":{"id":471},"sortOrder":4,"id":124,"storyBookParagraphs":[]},{"image":{"id":472},"sortOrder":5,"id":125,"storyBookParagraphs":[]},{"image":{"id":473},"sortOrder":6,"id":126,"storyBookParagraphs":[]},{"image":{"id":474},"sortOrder":7,"id":127,"storyBookParagraphs":[]},{"image":{"id":475},"sortOrder":8,"id":128,"storyBookParagraphs":[]},{"image":{"id":476},"sortOrder":9,"id":129,"storyBookParagraphs":[]},{"image":{"id":477},"sortOrder":10,"id":130,"storyBookParagraphs":[]},{"image":{"id":478},"sortOrder":11,"id":131,"storyBookParagraphs":[]},{"image":{"id":479},"sortOrder":12,"id":132,"storyBookParagraphs":[]},{"image":{"id":480},"sortOrder":13,"id":133,"storyBookParagraphs":[]},{"image":{"id":481},"sortOrder":14,"id":134,"storyBookParagraphs":[]},{"image":{"id":482},"sortOrder":15,"id":135,"storyBookParagraphs":[]},{"image":{"id":483},"sortOrder":16,"id":136,"storyBookParagraphs":[]},{"image":{"id":484},"sortOrder":17,"id":137,"storyBookParagraphs":[]},{"image":{"id":485},"sortOrder":18,"id":138,"storyBookParagraphs":[]},{"image":{"id":486},"sortOrder":19,"id":139,"storyBookParagraphs":[]},{"image":{"id":487},"sortOrder":20,"id":140,"storyBookParagraphs":[]},{"image":{"id":488},"sortOrder":21,"id":141,"storyBookParagraphs":[]},{"image":{"id":489},"sortOrder":22,"id":142,"storyBookParagraphs":[]},{"image":{"id":490},"sortOrder":23,"id":143,"storyBookParagraphs":[]},{"image":{"id":491},"sortOrder":24,"id":144,"storyBookParagraphs":[]},{"image":{"id":492},"sortOrder":25,"id":145,"storyBookParagraphs":[]},{"image":{"id":493},"sortOrder":26,"id":146,"storyBookParagraphs":[]}]

    # Extract the average word length and verify the zero-word fallback
    assert chapters_utils.get_avg_word_length(chapters_json) == 0

def test_get_avg_word_length_when_single_paragraphs():
    """Average word length over chapters that each hold a single paragraph."""
    # Three chapters, one paragraph of original text apiece
    chapters_json = [{"image":{"id":447},"sortOrder":0,"id":99,"storyBookParagraphs":[{"originalText":"Earth is the planet that we live on. Currently no other planet is known to contain life.","sortOrder":0,"id":142}]},{"image":{"id":448},"sortOrder":1,"id":100,"storyBookParagraphs":[{"originalText":"The Earth is in danger because of global warming. Global warming is caused by too much carbon dioxide in the atmosphere. Carbon dioxide is a gas which traps heat in the Earth. Without it Earth\'s heat would flow out and Earth would freeze.","sortOrder":0,"id":143}]},{"image":{"id":449},"sortOrder":2,"id":101,"storyBookParagraphs":[{"originalText":"The cars we drive create lots of carbon dioxide. We should walk more or ride a bicycle.","sortOrder":0,"id":144}]}]

    # Extract the average word length and pin the expected value
    assert chapters_utils.get_avg_word_length(chapters_json) == 4.285714285714286

0 comments on commit 4ac0027

Please sign in to comment.