Skip to content

Commit

Permalink
Merge pull request #24 from elimu-ai/21-perform-exploratory-data-analysis-eda
Browse files Browse the repository at this point in the history

Visualize chapter count, paragraph count, word count per reading level
  • Loading branch information
jo-elimu authored Sep 12, 2024
2 parents 4c90ed4 + 6ea7a38 commit 4ac0027
Show file tree
Hide file tree
Showing 17 changed files with 166 additions and 60 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pmml-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- name: Git Commit
run: |
git add step1_prepare/*.csv
git add step1_prepare/*.png
git add step2_train/*.pmml
git add step3_predict/*.csv
git add step3_predict/*.txt
Expand Down
1 change: 1 addition & 0 deletions pmml/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
matplotlib==3.8.2
numpy==1.26.4
pandas==2.2.2
pypmml==0.9.17
Expand Down
1 change: 1 addition & 0 deletions pmml/run_all_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import step1_prepare.step1_1_download_data
import step1_prepare.step1_2_preprocess_data
import step1_prepare.step1_3_split_data
import step1_prepare.step1_4_eda

print('\n*** Step 2. Train Model ***')
os.chdir('../step2_train')
Expand Down
25 changes: 25 additions & 0 deletions pmml/step1_prepare/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Data Visualizations

## `chapter_count`

![](./step1_4_chapter_count_scatter.png)

![](./step1_4_chapter_count_hist.png)

## `paragraph_count`

![](./step1_4_paragraph_count_scatter.png)

![](./step1_4_paragraph_count_hist.png)

## `word_count`

![](./step1_4_word_count_scatter.png)

![](./step1_4_word_count_hist.png)

## `avg_word_length`

![](./step1_4_avg_word_length_scatter.png)

![](./step1_4_avg_word_length_hist.png)
6 changes: 5 additions & 1 deletion pmml/step1_prepare/step1_2_preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
storybooks_dataframe['chapter_count'] = 0
storybooks_dataframe['paragraph_count'] = 0
storybooks_dataframe['word_count'] = 0
storybooks_dataframe['avg_word_length'] = 0
for index in storybooks_dataframe.index:
print(basename(__file__), f'index: {index}')
chapters = storybooks_dataframe.loc[index]['chapters']
Expand All @@ -29,10 +30,13 @@

word_count = chapters_utils.get_word_count(chapters_json)
storybooks_dataframe.loc[index, 'word_count'] = word_count

avg_word_length = chapters_utils.get_avg_word_length(chapters_json)
storybooks_dataframe.loc[index, 'avg_word_length'] = avg_word_length
print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}')

# Drop unnecessary columns
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count']]
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count', 'avg_word_length']]
print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}')

# Drop missing values
Expand Down
118 changes: 59 additions & 59 deletions pmml/step1_prepare/step1_2_storybooks.csv
Original file line number Diff line number Diff line change
@@ -1,59 +1,59 @@
id,reading_level,chapter_count,paragraph_count,word_count
37,1,11,20,26
1,1,7,17,167
36,1,11,29,138
59,2,16,29,129
25,3,19,54,1596
29,1,11,13,110
63,1,15,13,101
2,1,7,11,123
15,3,23,47,1291
6,1,7,7,78
53,2,15,62,486
66,2,15,44,442
56,2,11,37,398
4,1,12,25,117
42,1,15,37,264
61,2,17,29,178
50,2,18,48,811
32,1,10,25,209
11,1,12,31,194
26,2,7,11,393
51,2,19,48,623
7,1,11,13,144
64,2,4,25,85
55,2,18,52,461
34,1,11,19,148
47,1,16,18,117
24,3,14,20,731
27,2,11,23,211
46,1,5,5,81
65,2,12,30,426
60,2,11,42,427
17,3,9,10,374
19,3,15,39,1342
39,1,13,15,177
5,1,11,21,116
28,1,10,17,95
20,3,13,22,672
30,1,13,28,201
22,4,23,75,2057
57,2,16,22,133
33,1,14,13,23
8,1,11,12,150
9,1,11,11,107
16,1,11,13,120
41,4,10,30,555
10,1,6,18,101
58,2,11,16,131
49,1,9,23,258
52,2,20,41,559
18,3,17,38,564
40,1,4,16,89
14,2,14,40,488
23,4,16,55,1507
38,1,14,20,140
54,2,11,28,211
21,3,12,26,965
31,1,4,4,20
48,1,8,15,176
id,reading_level,chapter_count,paragraph_count,word_count,avg_word_length
37,1,11,20,26,3.5
1,1,7,17,167,3.784431137724551
36,1,11,29,138,4.115942028985507
59,2,16,29,129,4.10077519379845
25,3,19,54,1596,3.7042606516290726
29,1,11,13,110,3.536363636363636
63,1,15,13,101,3.742574257425743
2,1,7,11,123,3.8292682926829267
15,3,23,47,1291,3.5716498838109993
6,1,7,7,78,3.5384615384615383
53,2,15,62,486,3.707818930041152
66,2,15,44,442,4.009049773755656
56,2,11,37,398,3.6733668341708543
4,1,12,25,117,3.2051282051282053
42,1,15,37,264,3.7007575757575757
61,2,17,29,178,3.741573033707865
50,2,18,48,811,3.6757090012330456
32,1,10,25,209,3.7416267942583734
11,1,12,31,194,3.2989690721649483
26,2,7,11,393,3.6259541984732824
51,2,19,48,623,3.874799357945425
7,1,11,13,144,3.5555555555555554
64,2,4,25,85,3.847058823529412
55,2,18,52,461,3.6594360086767894
34,1,11,19,148,3.7635135135135136
47,1,16,18,117,3.4358974358974357
24,3,14,20,731,3.753761969904241
27,2,11,23,211,3.853080568720379
46,1,5,5,81,3.691358024691358
65,2,12,30,426,3.5985915492957745
60,2,11,42,427,3.7423887587822016
17,3,9,10,374,3.72192513368984
19,3,15,39,1342,3.4865871833084947
39,1,13,15,177,3.6271186440677967
5,1,11,21,116,4.646551724137931
28,1,10,17,95,4.578947368421052
20,3,13,22,672,3.6845238095238093
30,1,13,28,201,3.5970149253731343
22,4,23,75,2057,3.631988332523092
57,2,16,22,133,3.56390977443609
33,1,14,13,23,6.695652173913044
8,1,11,12,150,3.2933333333333334
9,1,11,11,107,2.9158878504672896
16,1,11,13,120,3.6416666666666666
41,4,10,30,555,3.7495495495495494
10,1,6,18,101,3.1881188118811883
58,2,11,16,131,4.175572519083969
49,1,9,23,258,3.689922480620155
52,2,20,41,559,3.695885509838998
18,3,17,38,564,3.75177304964539
40,1,4,16,89,3.831460674157303
14,2,14,40,488,3.8155737704918034
23,4,16,55,1507,3.727272727272727
38,1,14,20,140,3.8642857142857143
54,2,11,28,211,3.6587677725118484
21,3,12,26,965,3.5761658031088084
31,1,4,4,20,3.85
48,1,8,15,176,3.727272727272727
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
32 changes: 32 additions & 0 deletions pmml/step1_prepare/step1_4_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from os.path import basename
from matplotlib import pyplot
import pandas

# Exploratory data analysis (EDA): for every feature column in the
# preprocessed storybook data, save a scatter plot and a per-reading-level
# histogram as PNG files. NOTE: this module runs at import time on purpose —
# `run_all_steps.py` triggers it with a plain `import`.

# Read the preprocessed storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_2_storybooks.csv'
print(basename(__file__), f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# Skip the first two columns (`id`, `reading_level`); plot every feature column
for label in storybooks_dataframe.columns[2:]:
    # Consistency fix: prefix the log line with the script name, like every
    # other log statement in this project
    print(basename(__file__), f'label: {label}')

    # Scatter plot: feature value vs. reading level
    pyplot.scatter(storybooks_dataframe['reading_level'], storybooks_dataframe[label], alpha=0.5)
    pyplot.xlabel('reading_level')
    pyplot.ylabel(label)
    pyplot.savefig(f'step1_4_{label}_scatter.png')
    pyplot.clf()  # reset the figure so plots don't accumulate across iterations

    # Histogram: overlay one distribution per reading level (alpha keeps
    # overlapping bars visible)
    level_1 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 1][label]
    level_2 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 2][label]
    level_3 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 3][label]
    level_4 = storybooks_dataframe[storybooks_dataframe['reading_level'] == 4][label]
    pyplot.hist(level_1, label='LEVEL1', alpha=0.5)
    pyplot.hist(level_2, label='LEVEL2', alpha=0.5)
    pyplot.hist(level_3, label='LEVEL3', alpha=0.5)
    pyplot.hist(level_4, label='LEVEL4', alpha=0.5)
    pyplot.xlabel(label)
    pyplot.legend()
    pyplot.savefig(f'step1_4_{label}_hist.png')
    pyplot.clf()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pmml/step1_prepare/step1_4_word_count_hist.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added pmml/step1_prepare/step1_4_word_count_scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
24 changes: 24 additions & 0 deletions pmml/utils/chapters_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from os.path import basename
import string

'''json
[
Expand Down Expand Up @@ -73,3 +74,26 @@ def get_word_count(chapters_json):
print(basename(__file__), f'words: {words}')
word_count += len(words)
return word_count

def get_avg_word_length(chapters_json):
    """Return the average length of the words across all chapters.

    Punctuation is stripped from each word (via `str.translate`) before its
    length is measured, so "word." and "word" both count as length 4.
    Returns 0 when the chapters contain no words at all.
    """
    # Bug fix: this log line previously said 'get_word_count' (copy-paste
    # from the sibling function above)
    print(basename(__file__), 'get_avg_word_length')
    word_count = 0
    total_word_length = 0
    # Hoisted out of the loops: the translation table is loop-invariant
    punctuation_table = str.maketrans('', '', string.punctuation)
    for chapter in chapters_json:
        print(basename(__file__), f'chapter["sortOrder"]: {chapter["sortOrder"]}')
        for paragraph in chapter["storyBookParagraphs"]:
            print(basename(__file__), f'paragraph: {paragraph}')
            words = paragraph["originalText"].split()
            print(basename(__file__), f'words: {words}')
            word_count += len(words)
            for word in words:
                print(basename(__file__), f'word: {word}')
                word_cleaned = word.translate(punctuation_table)
                print(basename(__file__), f'word_cleaned: {word_cleaned}')
                total_word_length += len(word_cleaned)
    print(basename(__file__), f'word_count: {word_count}')
    print(basename(__file__), f'total_word_length: {total_word_length}')
    # Guard against division by zero for books without any words
    if word_count == 0:
        return 0
    return total_word_length / word_count
18 changes: 18 additions & 0 deletions pmml/utils/test_chapters_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,21 @@ def test_get_word_count_when_multiple_paragraphs():
paragraph_count = chapters_utils.get_word_count(chapters_json)

assert paragraph_count == 232

def test_get_avg_word_length_when_no_paragraphs():
    """A book whose chapters contain no paragraphs has an average word length of 0."""
    # 27 chapters, each with an empty `storyBookParagraphs` list
    chapters_json = [{"image":{"id":467},"sortOrder":0,"id":120,"storyBookParagraphs":[]},{"image":{"id":468},"sortOrder":1,"id":121,"storyBookParagraphs":[]},{"image":{"id":469},"sortOrder":2,"id":122,"storyBookParagraphs":[]},{"image":{"id":470},"sortOrder":3,"id":123,"storyBookParagraphs":[]},{"image":{"id":471},"sortOrder":4,"id":124,"storyBookParagraphs":[]},{"image":{"id":472},"sortOrder":5,"id":125,"storyBookParagraphs":[]},{"image":{"id":473},"sortOrder":6,"id":126,"storyBookParagraphs":[]},{"image":{"id":474},"sortOrder":7,"id":127,"storyBookParagraphs":[]},{"image":{"id":475},"sortOrder":8,"id":128,"storyBookParagraphs":[]},{"image":{"id":476},"sortOrder":9,"id":129,"storyBookParagraphs":[]},{"image":{"id":477},"sortOrder":10,"id":130,"storyBookParagraphs":[]},{"image":{"id":478},"sortOrder":11,"id":131,"storyBookParagraphs":[]},{"image":{"id":479},"sortOrder":12,"id":132,"storyBookParagraphs":[]},{"image":{"id":480},"sortOrder":13,"id":133,"storyBookParagraphs":[]},{"image":{"id":481},"sortOrder":14,"id":134,"storyBookParagraphs":[]},{"image":{"id":482},"sortOrder":15,"id":135,"storyBookParagraphs":[]},{"image":{"id":483},"sortOrder":16,"id":136,"storyBookParagraphs":[]},{"image":{"id":484},"sortOrder":17,"id":137,"storyBookParagraphs":[]},{"image":{"id":485},"sortOrder":18,"id":138,"storyBookParagraphs":[]},{"image":{"id":486},"sortOrder":19,"id":139,"storyBookParagraphs":[]},{"image":{"id":487},"sortOrder":20,"id":140,"storyBookParagraphs":[]},{"image":{"id":488},"sortOrder":21,"id":141,"storyBookParagraphs":[]},{"image":{"id":489},"sortOrder":22,"id":142,"storyBookParagraphs":[]},{"image":{"id":490},"sortOrder":23,"id":143,"storyBookParagraphs":[]},{"image":{"id":491},"sortOrder":24,"id":144,"storyBookParagraphs":[]},{"image":{"id":492},"sortOrder":25,"id":145,"storyBookParagraphs":[]},{"image":{"id":493},"sortOrder":26,"id":146,"storyBookParagraphs":[]}]

    # Extract the average word length and verify the zero-word fallback
    assert chapters_utils.get_avg_word_length(chapters_json) == 0

def test_get_avg_word_length_when_single_paragraphs():
    """Average word length over chapters that each hold a single paragraph."""
    # Three chapters, one paragraph of original text apiece
    chapters_json = [{"image":{"id":447},"sortOrder":0,"id":99,"storyBookParagraphs":[{"originalText":"Earth is the planet that we live on. Currently no other planet is known to contain life.","sortOrder":0,"id":142}]},{"image":{"id":448},"sortOrder":1,"id":100,"storyBookParagraphs":[{"originalText":"The Earth is in danger because of global warming. Global warming is caused by too much carbon dioxide in the atmosphere. Carbon dioxide is a gas which traps heat in the Earth. Without it Earth\'s heat would flow out and Earth would freeze.","sortOrder":0,"id":143}]},{"image":{"id":449},"sortOrder":2,"id":101,"storyBookParagraphs":[{"originalText":"The cars we drive create lots of carbon dioxide. We should walk more or ride a bicycle.","sortOrder":0,"id":144}]}]

    # Extract the average word length and pin the expected value
    assert chapters_utils.get_avg_word_length(chapters_json) == 4.285714285714286

0 comments on commit 4ac0027

Please sign in to comment.