Skip to content

Commit

Permalink
add paragraph count
Browse files Browse the repository at this point in the history
  • Loading branch information
jo-elimu committed Aug 17, 2024
1 parent 1d62496 commit ea0e471
Show file tree
Hide file tree
Showing 14 changed files with 88 additions and 41 deletions.
7 changes: 6 additions & 1 deletion pmml/step1_prepare/step1_2_preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,21 @@

# For each row in the DataFrame, extract information from the JSON string stored in the `chapters` column
storybooks_dataframe['chapter_count'] = 0
storybooks_dataframe['paragraph_count'] = 0
for index in storybooks_dataframe.index:
print(basename(__file__), f'index: {index}')
chapters = storybooks_dataframe.loc[index]['chapters']
chapters_json = json.loads(chapters)

chapter_count = chapters_utils.get_chapter_count(chapters_json)
storybooks_dataframe.loc[index, 'chapter_count'] = chapter_count

paragraph_count = chapters_utils.get_paragraph_count(chapters_json)
storybooks_dataframe.loc[index, 'paragraph_count'] = paragraph_count
print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}')

# Drop unnecessary columns
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count']]
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count']]
print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}')

# Drop missing values
Expand Down
18 changes: 9 additions & 9 deletions pmml/step1_prepare/step1_2_storybooks.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
id,reading_level,chapter_count
65,1,27
66,1,13
63,2,3
61,1,19
69,4,17
68,3,12
62,1,11
67,2,20
id,reading_level,chapter_count,paragraph_count
65,1,27,0
66,1,13,13
63,2,3,3
61,1,19,31
69,4,17,16
68,3,12,18
62,1,11,13
67,2,20,19
Binary file modified pmml/step2_train/step2_1_model.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion pmml/step2_train/step2_1_train_model_pkl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
print(os.path.basename(__file__), 'y:\n{}'.format(y))

# Choose features
X = storybooks_dataframe[['id', 'chapter_count']]
X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count']]
print(os.path.basename(__file__), 'type(X): {}'.format(type(X)))
print(os.path.basename(__file__), 'X:\n{}'.format(X))

Expand Down
19 changes: 12 additions & 7 deletions pmml/step2_train/step2_2_model.pmml
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@
<PMML xmlns="http://www.dmg.org/PMML-4_4" xmlns:data="http://jpmml.org/jpmml-model/InlineTable" version="4.4">
<Header>
<Application name="SkLearn2PMML package" version="0.110.0"/>
<Timestamp>2024-08-17T07:35:44Z</Timestamp>
<Timestamp>2024-08-17T10:17:48Z</Timestamp>
</Header>
<DataDictionary>
<DataField name="reading_level" optype="continuous" dataType="double"/>
<DataField name="id" optype="continuous" dataType="float"/>
<DataField name="chapter_count" optype="continuous" dataType="float"/>
<DataField name="paragraph_count" optype="continuous" dataType="float"/>
</DataDictionary>
<TreeModel functionName="regression" algorithmName="sklearn.tree._classes.DecisionTreeRegressor" missingValueStrategy="nullPrediction" noTrueChildStrategy="returnLastPrediction">
<MiningSchema>
<MiningField name="reading_level" usageType="target"/>
<MiningField name="id"/>
<MiningField name="chapter_count"/>
<MiningField name="paragraph_count"/>
</MiningSchema>
<LocalTransformations>
<DerivedField name="double(id)" optype="continuous" dataType="double">
Expand All @@ -22,20 +24,23 @@
<DerivedField name="double(chapter_count)" optype="continuous" dataType="double">
<FieldRef field="chapter_count"/>
</DerivedField>
<DerivedField name="double(paragraph_count)" optype="continuous" dataType="double">
<FieldRef field="paragraph_count"/>
</DerivedField>
</LocalTransformations>
<Node score="4.0">
<Node score="3.0">
<True/>
<Node score="2.0">
<SimplePredicate field="double(id)" operator="lessOrEqual" value="67.5"/>
<Node score="2.0">
<SimplePredicate field="double(chapter_count)" operator="lessOrEqual" value="7.0"/>
</Node>
<Node score="1.0">
<SimplePredicate field="double(id)" operator="lessOrEqual" value="66.5"/>
<Node score="2.0">
<SimplePredicate field="double(chapter_count)" operator="lessOrEqual" value="7.0"/>
</Node>
</Node>
</Node>
<Node score="3.0">
<SimplePredicate field="double(chapter_count)" operator="lessOrEqual" value="14.5"/>
<Node score="4.0">
<SimplePredicate field="double(paragraph_count)" operator="lessOrEqual" value="17.0"/>
</Node>
</Node>
</TreeModel>
Expand Down
2 changes: 1 addition & 1 deletion pmml/step2_train/step2_2_train_model_pmml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
print(os.path.basename(__file__), 'y:\n{}'.format(y))

# Choose features
X = storybooks_dataframe[['id', 'chapter_count']]
X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count']]
print(os.path.basename(__file__), 'type(X): {}'.format(type(X)))
print(os.path.basename(__file__), 'X:\n{}'.format(X))

Expand Down
2 changes: 1 addition & 1 deletion pmml/step3_predict/step3_1_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe))

# Predict
storybook_features = ['id', 'chapter_count']
storybook_features = ['id', 'chapter_count', 'paragraph_count']
predictions_array = reading_level_model.predict(storybooks_dataframe[storybook_features])
print(os.path.basename(__file__), 'predictions_array:\n{}'.format(predictions_array))
print(os.path.basename(__file__), 'type(predictions_array):\n{}'.format(type(predictions_array)))
Expand Down
6 changes: 3 additions & 3 deletions pmml/step3_predict/step3_1_predictions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ reading_level
1.0
1.0
1.0
3.0
3.0
3.0
4.0
4.0
4.0
3.0
18 changes: 9 additions & 9 deletions pmml/step3_predict/step3_1_storybooks_test.csv
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
id,reading_level,chapter_count
50,1,10
55,2,11
52,3,10
57,4,13
89,1,2
88,2,4
82,3,8
87,4,16
id,reading_level,chapter_count,paragraph_count
50,1,10,10
55,2,11,22
52,3,10,10
57,4,13,26
89,1,2,2
88,2,4,8
82,3,8,8
87,4,16,32
2 changes: 1 addition & 1 deletion pmml/step3_predict/step3_2_mean_absolute_error.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.125
1.625
6 changes: 3 additions & 3 deletions pmml/step3_predict/step3_2_predictions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ predicted_reading_level
1.0
1.0
1.0
3.0
3.0
3.0
4.0
4.0
4.0
3.0
2 changes: 1 addition & 1 deletion pmml/step3_predict/step3_2_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe))

# Predict
storybook_features = ['id', 'chapter_count']
storybook_features = ['id', 'chapter_count', 'paragraph_count']
predictions = reading_level_model.predict(storybooks_dataframe[storybook_features])
print(os.path.basename(__file__), 'predictions:\n{}'.format(predictions))
print(os.path.basename(__file__), 'type(predictions):\n{}'.format(type(predictions)))
Expand Down
18 changes: 14 additions & 4 deletions pmml/utils/chapters_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
from os.path import basename

'''json
[
Expand All @@ -8,7 +8,7 @@
},
"sortOrder": 0,
"id": 99,
"storyBookParagraphs": [
"storyBookParagraphs": [
{
"originalText": "Earth is the planet that we live on. Currently no other planet is known to contain life.",
"sortOrder": 0,
Expand Down Expand Up @@ -47,7 +47,17 @@
]
'''
def get_chapter_count(chapters_json):
    """Return the number of chapters in a storybook.

    Args:
        chapters_json: The parsed `chapters` JSON of a storybook; a sequence
            with one element per chapter.

    Returns:
        The number of elements in `chapters_json`.
    """
    print(basename(__file__), 'get_chapter_count')
    chapter_count = len(chapters_json)
    print(basename(__file__), f'chapter_count: {chapter_count}')
    return chapter_count

def get_paragraph_count(chapters_json):
    """Return the total number of paragraphs across all chapters.

    Args:
        chapters_json: The parsed `chapters` JSON of a storybook; an iterable
            of chapter dicts, each normally holding its paragraphs under the
            `storyBookParagraphs` key.

    Returns:
        The sum of the paragraph counts of every chapter. A chapter whose
        `storyBookParagraphs` entry is missing or `None` contributes 0.
    """
    print(basename(__file__), 'get_paragraph_count')
    paragraph_count = 0
    for chapter in chapters_json:
        # `sortOrder` is only logged, so a missing key must not abort the count.
        print(basename(__file__), f'chapter["sortOrder"]: {chapter.get("sortOrder")}')
        # Treat a missing or null paragraph list as a chapter without paragraphs.
        paragraphs = chapter.get("storyBookParagraphs") or []
        for paragraph in paragraphs:
            print(basename(__file__), f'paragraph: {paragraph}')
        paragraph_count += len(paragraphs)
    # Log the result the same way get_chapter_count() does.
    print(basename(__file__), f'paragraph_count: {paragraph_count}')
    return paragraph_count
27 changes: 27 additions & 0 deletions pmml/utils/test_chapters_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,30 @@ def test_get_chapter_count_when_multiple_paragraphs():
chapter_count = chapters_utils.get_chapter_count(chapters_json)

assert chapter_count == 12

def test_get_paragraph_count_when_no_paragraphs():
    """A storybook whose 27 chapters all have empty paragraph lists counts 0."""
    # Build the fixture: image ids 467..493, sortOrder 0..26, chapter ids 120..146,
    # each chapter with an empty `storyBookParagraphs` list.
    chapters_json = [
        {"image": {"id": 467 + position}, "sortOrder": position, "id": 120 + position, "storyBookParagraphs": []}
        for position in range(27)
    ]

    # Extract the number of paragraphs
    assert chapters_utils.get_paragraph_count(chapters_json) == 0

def test_get_paragraph_count_when_single_paragraphs():
    """Three chapters holding exactly one paragraph each count 3 paragraphs."""
    # Fixture: one entry per chapter, each with a single paragraph
    first_chapter = {"image":{"id":447},"sortOrder":0,"id":99,"storyBookParagraphs":[{"originalText":"Earth is the planet that we live on. Currently no other planet is known to contain life.","sortOrder":0,"id":142}]}
    second_chapter = {"image":{"id":448},"sortOrder":1,"id":100,"storyBookParagraphs":[{"originalText":"The Earth is in danger because of global warming. Global warming is caused by too much carbon dioxide in the atmosphere. Carbon dioxide is a gas which traps heat in the Earth. Without it Earth\'s heat would flow out and Earth would freeze.","sortOrder":0,"id":143}]}
    third_chapter = {"image":{"id":449},"sortOrder":2,"id":101,"storyBookParagraphs":[{"originalText":"The cars we drive create lots of carbon dioxide. We should walk more or ride a bicycle.","sortOrder":0,"id":144}]}
    chapters_json = [first_chapter, second_chapter, third_chapter]

    # Extract the number of paragraphs
    total_paragraphs = chapters_utils.get_paragraph_count(chapters_json)

    assert total_paragraphs == 3

def test_get_paragraph_count_when_multiple_paragraphs():
    """Chapters with zero, one, two and three paragraphs are summed correctly.

    Per-chapter paragraph counts of the fixture, in order:
    2, 2, 2, 2, 0, 2, 2, 1, 0, 3, 2, 0 -> 18 in total.
    """
    # Load JSON (one chapter per line)
    chapters_json = [
        {"image":{"id":529},"sortOrder":0,"id":180,"storyBookParagraphs":[{"originalText":"One day, in a sleepy village with no electricity, the Electricity Department decided to put up electricity poles.","sortOrder":0,"id":219},{"originalText":"The villagers were excited.","sortOrder":1,"id":220}]},
        {"image":{"id":530},"sortOrder":1,"id":181,"storyBookParagraphs":[{"originalText":"Every day, a big pit was dug out for each pole.","sortOrder":0,"id":221},{"originalText":"The entire village would gather and watch.","sortOrder":1,"id":222}]},
        {"image":{"id":531},"sortOrder":2,"id":182,"storyBookParagraphs":[{"originalText":"Ropes were used to pull up the poles. The villagers helped lift the poles into the pits.","sortOrder":0,"id":223},{"originalText":"Even the kids pulled the ropes with all their might. \"HAYYIIISHHHHAAAAAA!\"","sortOrder":1,"id":224}]},
        {"image":{"id":532},"sortOrder":3,"id":183,"storyBookParagraphs":[{"originalText":"Once the poles were up, the Electricity Department team vanished.","sortOrder":0,"id":225},{"originalText":"Every day, the villagers would wait for them to come back and finish their work. All they could talk about was how excited they were to get electricity.","sortOrder":1,"id":226}]},
        {"image":{"id":533},"sortOrder":4,"id":184,"storyBookParagraphs":[]},
        # The first paragraph text contains a line break; it is written as an
        # escape so the string literal stays on a single physical line.
        {"image":{"id":534},"sortOrder":5,"id":185,"storyBookParagraphs":[{"originalText":"The villagers began climbing up the poles. \nThe poles gave them a perfect view of their surroundings and far away villages.","sortOrder":0,"id":227},{"originalText":"Days passed, with no sign of the Electricity Department team.","sortOrder":1,"id":229}]},
        {"image":{"id":535},"sortOrder":6,"id":186,"storyBookParagraphs":[{"originalText":"One evening, a little child took a lantern from her house.","sortOrder":0,"id":231},{"originalText":"She climbed up one of the poles and hung the lantern on top.","sortOrder":1,"id":232}]},
        {"image":{"id":536},"sortOrder":7,"id":187,"storyBookParagraphs":[{"originalText":"Everyone noticed the lovely bright lantern on top of the pole.","sortOrder":0,"id":233}]},
        {"image":{"id":537},"sortOrder":8,"id":188,"storyBookParagraphs":[]},
        {"image":{"id":538},"sortOrder":9,"id":189,"storyBookParagraphs":[{"originalText":"The next evening, the villagers hung lanterns from all five poles.","sortOrder":0,"id":234},{"originalText":"Every day, lanterns were taken down, refueled, and hung back up again.","sortOrder":1,"id":235},{"originalText":"The villagers made a game of it and had lots of fun.","sortOrder":2,"id":236}]},
        {"image":{"id":539},"sortOrder":10,"id":190,"storyBookParagraphs":[{"originalText":"Then, even without electricity, the nearby villages could spot this little village from afar at night.","sortOrder":0,"id":237},{"originalText":"They began calling it \"The Village of Five Poles.\"","sortOrder":1,"id":240}]},
        {"image":{"id":540},"sortOrder":11,"id":191,"storyBookParagraphs":[]},
    ]

    # Extract the number of paragraphs
    paragraph_count = chapters_utils.get_paragraph_count(chapters_json)

    assert paragraph_count == 18

0 comments on commit ea0e471

Please sign in to comment.