From ea0e471284dec94043290482f6b9aaae5af4d4b7 Mon Sep 17 00:00:00 2001 From: jo-elimu <1451036+jo-elimu@users.noreply.github.com> Date: Sat, 17 Aug 2024 17:19:44 +0700 Subject: [PATCH] add paragraph count --- pmml/step1_prepare/step1_2_preprocess_data.py | 7 ++++- pmml/step1_prepare/step1_2_storybooks.csv | 18 ++++++------ pmml/step2_train/step2_1_model.pkl | Bin 1787 -> 1805 bytes pmml/step2_train/step2_1_train_model_pkl.py | 2 +- pmml/step2_train/step2_2_model.pmml | 19 +++++++----- pmml/step2_train/step2_2_train_model_pmml.py | 2 +- pmml/step3_predict/step3_1_predict.py | 2 +- pmml/step3_predict/step3_1_predictions.csv | 6 ++-- .../step3_predict/step3_1_storybooks_test.csv | 18 ++++++------ .../step3_2_mean_absolute_error.txt | 2 +- pmml/step3_predict/step3_2_predictions.csv | 6 ++-- pmml/step3_predict/step3_2_validate.py | 2 +- pmml/utils/chapters_utils.py | 18 +++++++++--- pmml/utils/test_chapters_utils.py | 27 ++++++++++++++++++ 14 files changed, 88 insertions(+), 41 deletions(-) diff --git a/pmml/step1_prepare/step1_2_preprocess_data.py b/pmml/step1_prepare/step1_2_preprocess_data.py index 3bd62b6..58e5a99 100644 --- a/pmml/step1_prepare/step1_2_preprocess_data.py +++ b/pmml/step1_prepare/step1_2_preprocess_data.py @@ -14,16 +14,21 @@ # For each row in the DataFrame, extract information from the JSON string stored in the `chapters` column storybooks_dataframe['chapter_count'] = 0 +storybooks_dataframe['paragraph_count'] = 0 for index in storybooks_dataframe.index: print(basename(__file__), f'index: {index}') chapters = storybooks_dataframe.loc[index]['chapters'] chapters_json = json.loads(chapters) + chapter_count = chapters_utils.get_chapter_count(chapters_json) storybooks_dataframe.loc[index, 'chapter_count'] = chapter_count + + paragraph_count = chapters_utils.get_paragraph_count(chapters_json) + storybooks_dataframe.loc[index, 'paragraph_count'] = paragraph_count print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}') # Drop unnecessary columns -storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count']] +storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count']] print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}') # Drop missing values diff --git a/pmml/step1_prepare/step1_2_storybooks.csv b/pmml/step1_prepare/step1_2_storybooks.csv index de02d7f..c0110de 100644 --- a/pmml/step1_prepare/step1_2_storybooks.csv +++ b/pmml/step1_prepare/step1_2_storybooks.csv @@ -1,9 +1,9 @@ -id,reading_level,chapter_count -65,1,27 -66,1,13 -63,2,3 -61,1,19 -69,4,17 -68,3,12 -62,1,11 -67,2,20 +id,reading_level,chapter_count,paragraph_count +65,1,27,0 +66,1,13,13 +63,2,3,3 +61,1,19,31 +69,4,17,16 +68,3,12,18 +62,1,11,13 +67,2,20,19 diff --git a/pmml/step2_train/step2_1_model.pkl b/pmml/step2_train/step2_1_model.pkl index 3b671213db90a38b2336190cd22e68b4524bf5a8..78cb17d8ddd06c18e3821921deae1df0ee693cff 100644 GIT binary patch delta 335 zcmey(+snt=z&e$QeIx5;Mn>kzI~l#Xd-w|yixSg|5(_dW|7DZ~ib^vn07)Arbs(9- z)X!)-`3qCf84=zJ zKzTo)It?K059AvF>43?GZ07a0B|ybJY-Nc#rK!bJGE6n5d7xPO&ps3kzSf~`#8NBy&2o5WPoieP3jSdFV4`_0SRz&iB<+eX&SjEqc^cQSfU7Gn|vvUHggfTSOjI*=@5>Sr{W%*GtV$UONY zvnZp? z0(EKtX+I#}07&~!c4jjduqgp*?_n!T%qdMRo|0iQxrq`AjKScuJ%sxKN;5b> d_&iXW14;`xOaU6r>CNcP*ghr0X7Wom6##YRQ4atB diff --git a/pmml/step2_train/step2_1_train_model_pkl.py b/pmml/step2_train/step2_1_train_model_pkl.py index 75be7ed..0e0a293 100644 --- a/pmml/step2_train/step2_1_train_model_pkl.py +++ b/pmml/step2_train/step2_1_train_model_pkl.py @@ -16,7 +16,7 @@ print(os.path.basename(__file__), 'y:\n{}'.format(y)) # Choose features -X = storybooks_dataframe[['id', 'chapter_count']] +X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count']] print(os.path.basename(__file__), 'type(X): {}'.format(type(X))) print(os.path.basename(__file__), 'X:\n{}'.format(X)) diff --git a/pmml/step2_train/step2_2_model.pmml b/pmml/step2_train/step2_2_model.pmml index 13342d1..97a3316 100644 --- a/pmml/step2_train/step2_2_model.pmml +++ b/pmml/step2_train/step2_2_model.pmml @@ -2,18 +2,20 @@
- 2024-08-17T07:35:44Z + 2024-08-17T10:17:48Z
+ + @@ -22,20 +24,23 @@ + + + - + - - - + + + - - + + diff --git a/pmml/step2_train/step2_2_train_model_pmml.py b/pmml/step2_train/step2_2_train_model_pmml.py index 108e93e..ab75935 100644 --- a/pmml/step2_train/step2_2_train_model_pmml.py +++ b/pmml/step2_train/step2_2_train_model_pmml.py @@ -16,7 +16,7 @@ print(os.path.basename(__file__), 'y:\n{}'.format(y)) # Choose features -X = storybooks_dataframe[['id', 'chapter_count']] +X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count']] print(os.path.basename(__file__), 'type(X): {}'.format(type(X))) print(os.path.basename(__file__), 'X:\n{}'.format(X)) diff --git a/pmml/step3_predict/step3_1_predict.py b/pmml/step3_predict/step3_1_predict.py index ccb0263..5e11256 100644 --- a/pmml/step3_predict/step3_1_predict.py +++ b/pmml/step3_predict/step3_1_predict.py @@ -16,7 +16,7 @@ print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe)) # Predict -storybook_features = ['id', 'chapter_count'] +storybook_features = ['id', 'chapter_count', 'paragraph_count'] predictions_array = reading_level_model.predict(storybooks_dataframe[storybook_features]) print(os.path.basename(__file__), 'predictions_array:\n{}'.format(predictions_array)) print(os.path.basename(__file__), 'type(predictions_array):\n{}'.format(type(predictions_array))) diff --git a/pmml/step3_predict/step3_1_predictions.csv b/pmml/step3_predict/step3_1_predictions.csv index 5a1c588..c4afa56 100644 --- a/pmml/step3_predict/step3_1_predictions.csv +++ b/pmml/step3_predict/step3_1_predictions.csv @@ -3,7 +3,7 @@ reading_level 1.0 1.0 1.0 -3.0 -3.0 -3.0 4.0 +4.0 +4.0 +3.0 diff --git a/pmml/step3_predict/step3_1_storybooks_test.csv b/pmml/step3_predict/step3_1_storybooks_test.csv index b62c66a..a177f9f 100644 --- a/pmml/step3_predict/step3_1_storybooks_test.csv +++ b/pmml/step3_predict/step3_1_storybooks_test.csv @@ -1,9 +1,9 @@ -id,reading_level,chapter_count -50,1,10 -55,2,11 -52,3,10 -57,4,13 -89,1,2 -88,2,4 -82,3,8 -87,4,16 +id,reading_level,chapter_count,paragraph_count +50,1,10,10 +55,2,11,22 +52,3,10,10 +57,4,13,26 +89,1,2,2 +88,2,4,8 +82,3,8,8 +87,4,16,32 diff --git a/pmml/step3_predict/step3_2_mean_absolute_error.txt b/pmml/step3_predict/step3_2_mean_absolute_error.txt index 9f5aceb..ebac217 100644 --- a/pmml/step3_predict/step3_2_mean_absolute_error.txt +++ b/pmml/step3_predict/step3_2_mean_absolute_error.txt @@ -1 +1 @@ -1.125 \ No newline at end of file +1.625 \ No newline at end of file diff --git a/pmml/step3_predict/step3_2_predictions.csv b/pmml/step3_predict/step3_2_predictions.csv index 2d01e13..645369e 100644 --- a/pmml/step3_predict/step3_2_predictions.csv +++ b/pmml/step3_predict/step3_2_predictions.csv @@ -3,7 +3,7 @@ predicted_reading_level 1.0 1.0 1.0 -3.0 -3.0 -3.0 4.0 +4.0 +4.0 +3.0 diff --git a/pmml/step3_predict/step3_2_validate.py b/pmml/step3_predict/step3_2_validate.py index dd73b7e..d6c0ead 100644 --- a/pmml/step3_predict/step3_2_validate.py +++ b/pmml/step3_predict/step3_2_validate.py @@ -16,7 +16,7 @@ print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe)) # Predict -storybook_features = ['id', 'chapter_count'] +storybook_features = ['id', 'chapter_count', 'paragraph_count'] predictions = reading_level_model.predict(storybooks_dataframe[storybook_features]) print(os.path.basename(__file__), 'predictions:\n{}'.format(predictions)) print(os.path.basename(__file__), 'type(predictions):\n{}'.format(type(predictions))) diff --git a/pmml/utils/chapters_utils.py b/pmml/utils/chapters_utils.py index c6124da..4bf09a5 100644 --- a/pmml/utils/chapters_utils.py +++ b/pmml/utils/chapters_utils.py @@ -1,4 +1,4 @@ -import os +from os.path import basename '''json [ @@ -8,7 +8,7 @@ }, "sortOrder": 0, "id": 99, - "storyBookParagraphs": [ + "storyBookParagraphs": [ { "originalText": "Earth is the planet that we live on. Currently no other planet is known to contain life.", "sortOrder": 0, @@ -47,7 +47,17 @@ ] ''' def get_chapter_count(chapters_json): - print(os.path.basename(__file__), "get_chapter_count") + print(basename(__file__), 'get_chapter_count') chapter_count = len(chapters_json) - print(os.path.basename(__file__), "chapter_count: {}".format(chapter_count)) + print(basename(__file__), f'chapter_count: {chapter_count}') return chapter_count + +def get_paragraph_count(chapters_json): + print(basename(__file__), 'get_paragraph_count') + paragraph_count = 0 + for chapter in chapters_json: + print(basename(__file__), f'chapter["sortOrder"]: {chapter["sortOrder"]}') + for paragraph in chapter["storyBookParagraphs"]: + print(basename(__file__), f'paragraph: {paragraph}') + paragraph_count += 1 + return paragraph_count diff --git a/pmml/utils/test_chapters_utils.py b/pmml/utils/test_chapters_utils.py index 73e7ebd..0063dca 100644 --- a/pmml/utils/test_chapters_utils.py +++ b/pmml/utils/test_chapters_utils.py @@ -26,3 +26,30 @@ def test_get_chapter_count_when_multiple_paragraphs(): chapter_count = chapters_utils.get_chapter_count(chapters_json) assert chapter_count == 12 + +def test_get_paragraph_count_when_no_paragraphs(): + # Load JSON + chapters_json = [{"image":{"id":467},"sortOrder":0,"id":120,"storyBookParagraphs":[]},{"image":{"id":468},"sortOrder":1,"id":121,"storyBookParagraphs":[]},{"image":{"id":469},"sortOrder":2,"id":122,"storyBookParagraphs":[]},{"image":{"id":470},"sortOrder":3,"id":123,"storyBookParagraphs":[]},{"image":{"id":471},"sortOrder":4,"id":124,"storyBookParagraphs":[]},{"image":{"id":472},"sortOrder":5,"id":125,"storyBookParagraphs":[]},{"image":{"id":473},"sortOrder":6,"id":126,"storyBookParagraphs":[]},{"image":{"id":474},"sortOrder":7,"id":127,"storyBookParagraphs":[]},{"image":{"id":475},"sortOrder":8,"id":128,"storyBookParagraphs":[]},{"image":{"id":476},"sortOrder":9,"id":129,"storyBookParagraphs":[]},{"image":{"id":477},"sortOrder":10,"id":130,"storyBookParagraphs":[]},{"image":{"id":478},"sortOrder":11,"id":131,"storyBookParagraphs":[]},{"image":{"id":479},"sortOrder":12,"id":132,"storyBookParagraphs":[]},{"image":{"id":480},"sortOrder":13,"id":133,"storyBookParagraphs":[]},{"image":{"id":481},"sortOrder":14,"id":134,"storyBookParagraphs":[]},{"image":{"id":482},"sortOrder":15,"id":135,"storyBookParagraphs":[]},{"image":{"id":483},"sortOrder":16,"id":136,"storyBookParagraphs":[]},{"image":{"id":484},"sortOrder":17,"id":137,"storyBookParagraphs":[]},{"image":{"id":485},"sortOrder":18,"id":138,"storyBookParagraphs":[]},{"image":{"id":486},"sortOrder":19,"id":139,"storyBookParagraphs":[]},{"image":{"id":487},"sortOrder":20,"id":140,"storyBookParagraphs":[]},{"image":{"id":488},"sortOrder":21,"id":141,"storyBookParagraphs":[]},{"image":{"id":489},"sortOrder":22,"id":142,"storyBookParagraphs":[]},{"image":{"id":490},"sortOrder":23,"id":143,"storyBookParagraphs":[]},{"image":{"id":491},"sortOrder":24,"id":144,"storyBookParagraphs":[]},{"image":{"id":492},"sortOrder":25,"id":145,"storyBookParagraphs":[]},{"image":{"id":493},"sortOrder":26,"id":146,"storyBookParagraphs":[]}] + + # Extract the number of paragraphs + paragraph_count = chapters_utils.get_paragraph_count(chapters_json) + + assert paragraph_count == 0 + +def test_get_paragraph_count_when_single_paragraphs(): + # Load JSON + chapters_json = [{"image":{"id":447},"sortOrder":0,"id":99,"storyBookParagraphs":[{"originalText":"Earth is the planet that we live on. Currently no other planet is known to contain life.","sortOrder":0,"id":142}]},{"image":{"id":448},"sortOrder":1,"id":100,"storyBookParagraphs":[{"originalText":"The Earth is in danger because of global warming. Global warming is caused by too much carbon dioxide in the atmosphere. Carbon dioxide is a gas which traps heat in the Earth. Without it Earth\'s heat would flow out and Earth would freeze.","sortOrder":0,"id":143}]},{"image":{"id":449},"sortOrder":2,"id":101,"storyBookParagraphs":[{"originalText":"The cars we drive create lots of carbon dioxide. We should walk more or ride a bicycle.","sortOrder":0,"id":144}]}] + + # Extract the number of paragraphs + paragraph_count = chapters_utils.get_paragraph_count(chapters_json) + + assert paragraph_count == 3 + +def test_get_paragraph_count_when_multiple_paragraphs(): + # Load JSON + chapters_json = [{"image":{"id":529},"sortOrder":0,"id":180,"storyBookParagraphs":[{"originalText":"One day, in a sleepy village with no electricity, the Electricity Department decided to put up electricity poles.","sortOrder":0,"id":219},{"originalText":"The villagers were excited.","sortOrder":1,"id":220}]},{"image":{"id":530},"sortOrder":1,"id":181,"storyBookParagraphs":[{"originalText":"Every day, a big pit was dug out for each pole.","sortOrder":0,"id":221},{"originalText":"The entire village would gather and watch.","sortOrder":1,"id":222}]},{"image":{"id":531},"sortOrder":2,"id":182,"storyBookParagraphs":[{"originalText":"Ropes were used to pull up the poles. The villagers helped lift the poles into the pits.","sortOrder":0,"id":223},{"originalText":"Even the kids pulled the ropes with all their might. \"HAYYIIISHHHHAAAAAA!\"","sortOrder":1,"id":224}]},{"image":{"id":532},"sortOrder":3,"id":183,"storyBookParagraphs":[{"originalText":"Once the poles were up, the Electricity Department team vanished.","sortOrder":0,"id":225},{"originalText":"Every day, the villagers would wait for them to come back and finish their work. All they could talk about was how excited they were to get electricity.","sortOrder":1,"id":226}]},{"image":{"id":533},"sortOrder":4,"id":184,"storyBookParagraphs":[]},{"image":{"id":534},"sortOrder":5,"id":185,"storyBookParagraphs":[{"originalText":"The villagers began climbing up the poles. The poles gave them a perfect view of their surroundings and far away villages.","sortOrder":0,"id":227},{"originalText":"Days passed, with no sign of the Electricity Department team.","sortOrder":1,"id":229}]},{"image":{"id":535},"sortOrder":6,"id":186,"storyBookParagraphs":[{"originalText":"One evening, a little child took a lantern from her house.","sortOrder":0,"id":231},{"originalText":"She climbed up one of the poles and hung the lantern on top.","sortOrder":1,"id":232}]},{"image":{"id":536},"sortOrder":7,"id":187,"storyBookParagraphs":[{"originalText":"Everyone noticed the lovely bright lantern on top of the pole.","sortOrder":0,"id":233}]},{"image":{"id":537},"sortOrder":8,"id":188,"storyBookParagraphs":[]},{"image":{"id":538},"sortOrder":9,"id":189,"storyBookParagraphs":[{"originalText":"The next evening, the villagers hung lanterns from all five poles.","sortOrder":0,"id":234},{"originalText":"Every day, lanterns were taken down, refueled, and hung back up again.","sortOrder":1,"id":235},{"originalText":"The villagers made a game of it and had lots of fun.","sortOrder":2,"id":236}]},{"image":{"id":539},"sortOrder":10,"id":190,"storyBookParagraphs":[{"originalText":"Then, even without electricity, the nearby villages could spot this little village from afar at night.","sortOrder":0,"id":237},{"originalText":"They began calling it \"The Village of Five Poles.\"","sortOrder":1,"id":240}]},{"image":{"id":540},"sortOrder":11,"id":191,"storyBookParagraphs":[]}] + + # Extract the number of paragraphs + paragraph_count = chapters_utils.get_paragraph_count(chapters_json) + + assert paragraph_count == 18