Commit
Merge pull request #15 from elimu-ai/experimentation
Export PMML
Showing 23 changed files with 738 additions and 0 deletions.
@@ -0,0 +1,46 @@
# This workflow will install Python dependencies, run tests and lint with multiple versions of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application (PMML)

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11"]
    defaults:
      run:
        working-directory: pmml
    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
    - name: Run All Steps (1-3)
      run: |
        python run_all_steps.py
@@ -0,0 +1,15 @@
# ML: Storybook Reading Level Predictor (PMML)

## Code Usage

```shell
pip install -r requirements.txt
```

```shell
pytest
```

```shell
python run_all_steps.py
```
@@ -0,0 +1,5 @@
numpy==1.26.4
pandas==2.2.2
pypmml==0.9.17
scikit-learn==1.5.0
sklearn2pmml==0.110.0
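Here numpy, pandas, and scikit-learn cover data preparation and model training, while sklearn2pmml converts a fitted scikit-learn pipeline to PMML and pypmml evaluates the exported file from Python. A minimal sketch for checking that an installed environment matches these pins:

```python
from importlib.metadata import version

# Print the installed version of each pinned dependency
for package in ['numpy', 'pandas', 'pypmml', 'scikit-learn', 'sklearn2pmml']:
    print(package, version(package))
```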
@@ -0,0 +1,20 @@
import os

print('\n*** Step 1. Prepare Data ***')
os.chdir('step1_prepare')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step1_prepare.step1_1_download_data
import step1_prepare.step1_2_preprocess_data
import step1_prepare.step1_3_split_data

print('\n*** Step 2. Train Model ***')
os.chdir('../step2_train')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step2_train.step2_1_train_model_pkl
import step2_train.step2_2_train_model_pmml

print('\n*** Step 3. Make Prediction ***')
os.chdir('../step3_predict')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step3_predict.step3_1_predict
import step3_predict.step3_2_validate
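The `os.chdir()` calls and package imports above, combined with the workflow's `working-directory: pmml`, imply roughly the following layout (inferred from this diff; several of these files are not rendered below):

```
pmml/
├── requirements.txt
├── run_all_steps.py
├── step1_prepare/
│   ├── step1_1_download_data.py
│   ├── step1_2_preprocess_data.py
│   └── step1_3_split_data.py
├── step2_train/
│   ├── step2_1_train_model_pkl.py
│   └── step2_2_train_model_pmml.py
├── step3_predict/
│   ├── step3_1_predict.py
│   └── step3_2_validate.py
└── utils/
    └── chapters_utils.py
```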
@@ -0,0 +1,12 @@
import os
import pandas

# Read the storybooks CSV into a DataFrame
storybooks_csv_url = ('https://raw.githubusercontent.com/elimu-ai/webapp/main/src/main/resources/db/content_PROD'
                      '/hin/storybooks.csv')
print(os.path.basename(__file__), 'storybooks_csv_url: {}'.format(storybooks_csv_url))
storybooks_dataframe = pandas.read_csv(storybooks_csv_url)
print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe))

# Write the DataFrame to a CSV file
storybooks_dataframe.to_csv('step1_1_storybooks.csv', index=False)
(Large diff not rendered by default.)
@@ -0,0 +1,47 @@
import json
from os.path import basename, dirname, abspath
import pandas
import sys

parent_dir = dirname(dirname(abspath(__file__)))
sys.path.append(parent_dir)
from utils import chapters_utils

# Read the original storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_1_storybooks.csv'
print(basename(__file__), f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# For each row in the DataFrame, extract information from the JSON string stored in the `chapters` column
storybooks_dataframe['chapter_count'] = 0
storybooks_dataframe['paragraph_count'] = 0
storybooks_dataframe['word_count'] = 0
for index in storybooks_dataframe.index:
    print(basename(__file__), f'index: {index}')
    chapters = storybooks_dataframe.loc[index]['chapters']
    chapters_json = json.loads(chapters)

    chapter_count = chapters_utils.get_chapter_count(chapters_json)
    storybooks_dataframe.loc[index, 'chapter_count'] = chapter_count

    paragraph_count = chapters_utils.get_paragraph_count(chapters_json)
    storybooks_dataframe.loc[index, 'paragraph_count'] = paragraph_count

    word_count = chapters_utils.get_word_count(chapters_json)
    storybooks_dataframe.loc[index, 'word_count'] = word_count
print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}')

# Drop unnecessary columns
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count']]
print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}')

# Drop missing values
storybooks_dataframe = storybooks_dataframe.dropna()
print(basename(__file__), f'storybooks_dataframe (after dropping missing values): \n{storybooks_dataframe}')

# Extract number from reading level (e.g. 'LEVEL1' --> '1')
storybooks_dataframe['reading_level'] = storybooks_dataframe['reading_level'].str.extract('(\\d+)')
print(basename(__file__), f'storybooks_dataframe (after converting texts to numbers): \n{storybooks_dataframe}')

# Write the DataFrame to a CSV file
storybooks_dataframe.to_csv('step1_2_storybooks.csv', index=False)
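The preprocessing script above imports `chapters_utils` from a `utils` package that is not rendered on this page. A minimal sketch of what its three helpers might look like, assuming each chapter in the parsed `chapters` JSON holds its paragraph texts in a list under a `paragraphs` key (a hypothetical key name; the real storybook JSON layout may differ):

```python
# Hypothetical sketch of utils/chapters_utils.py; the 'paragraphs' key is an assumption.

def get_chapter_count(chapters_json):
    # Number of chapters in the storybook
    return len(chapters_json)


def get_paragraph_count(chapters_json):
    # Total number of paragraphs across all chapters
    return sum(len(chapter.get('paragraphs', [])) for chapter in chapters_json)


def get_word_count(chapters_json):
    # Total number of whitespace-separated words across all paragraphs
    return sum(
        len(paragraph_text.split())
        for chapter in chapters_json
        for paragraph_text in chapter.get('paragraphs', [])
    )
```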
@@ -0,0 +1,59 @@
id,reading_level,chapter_count,paragraph_count,word_count
37,1,11,20,26
1,1,7,17,167
36,1,11,29,138
59,2,16,29,129
25,3,19,54,1596
29,1,11,13,110
63,1,15,13,101
2,1,7,11,123
15,3,23,47,1291
6,1,7,7,78
53,2,15,62,486
66,2,15,44,442
56,2,11,37,398
4,1,12,25,117
42,1,15,37,264
61,2,17,29,178
50,2,18,48,811
32,1,10,25,209
11,1,12,31,194
26,2,7,11,393
51,2,19,48,623
7,1,11,13,144
64,2,4,25,85
55,2,18,52,461
34,1,11,19,148
47,1,16,18,117
24,3,14,20,731
27,2,11,23,211
46,1,5,5,81
65,2,12,30,426
60,2,11,42,427
17,3,9,10,374
19,3,15,39,1342
39,1,13,15,177
5,1,11,21,116
28,1,10,17,95
20,3,13,22,672
30,1,13,28,201
22,4,23,75,2057
57,2,16,22,133
33,1,14,13,23
8,1,11,12,150
9,1,11,11,107
16,1,11,13,120
41,4,10,30,555
10,1,6,18,101
58,2,11,16,131
49,1,9,23,258
52,2,20,41,559
18,3,17,38,564
40,1,4,16,89
14,2,14,40,488
23,4,16,55,1507
38,1,14,20,140
54,2,11,28,211
21,3,12,26,965
31,1,4,4,20
48,1,8,15,176
@@ -0,0 +1,23 @@
from os.path import basename
import pandas
from sklearn.model_selection import train_test_split

# Read the preprocessed storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_2_storybooks.csv'
print(basename(__file__), f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# Split the data into training and validation data
X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count', 'word_count']]
y = storybooks_dataframe[['reading_level']]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)
print(basename(__file__), f'train_X: \n{train_X}')
print(basename(__file__), f'train_y: \n{train_y}')
print(basename(__file__), f'test_X: \n{test_X}')
print(basename(__file__), f'test_y: \n{test_y}')

# Write the DataFrames to CSV files
storybooks_dataframe_train = pandas.concat([train_X, train_y], axis=1)
storybooks_dataframe_train.to_csv('step1_3_storybooks_train.csv', index=False)
storybooks_dataframe_test = pandas.concat([test_X, test_y], axis=1)
storybooks_dataframe_test.to_csv('step1_3_storybooks_test.csv', index=False)
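With the 58 preprocessed rows above and `test_size=0.2`, `train_test_split` assigns the rounded-up 20% (12 rows) to the test split and the remaining 46 rows to the training split, which matches the row counts of the two CSV files that follow. A quick sanity check:

```python
from math import ceil

# A float test_size is rounded up to a whole number of test rows
n_samples = 58
n_test = ceil(n_samples * 0.2)
print(n_test, n_samples - n_test)  # 12 46
```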
@@ -0,0 +1,13 @@
id,chapter_count,paragraph_count,word_count,reading_level
28,10,17,95,1
5,11,21,116,1
9,11,11,107,1
27,11,23,211,2
66,15,44,442,2
36,11,29,138,1
39,13,15,177,1
10,6,18,101,1
64,4,25,85,2
52,20,41,559,2
25,19,54,1596,3
53,15,62,486,2
@@ -0,0 +1,47 @@
id,chapter_count,paragraph_count,word_count,reading_level
65,12,30,426,2
33,14,13,23,1
19,15,39,1342,3
8,11,12,150,1
30,13,28,201,1
2,7,11,123,1
42,15,37,264,1
17,9,10,374,3
46,5,5,81,1
31,4,4,20,1
23,16,55,1507,4
11,12,31,194,1
54,11,28,211,2
24,14,20,731,3
61,17,29,178,2
29,11,13,110,1
60,11,42,427,2
50,18,48,811,2
18,17,38,564,3
51,19,48,623,2
40,4,16,89,1
15,23,47,1291,3
4,12,25,117,1
47,16,18,117,1
32,10,25,209,1
16,11,13,120,1
58,11,16,131,2
14,14,40,488,2
22,23,75,2057,4
1,7,17,167,1
56,11,37,398,2
48,8,15,176,1
34,11,19,148,1
63,15,13,101,1
55,18,52,461,2
20,13,22,672,3
7,11,13,144,1
26,7,11,393,2
6,7,7,78,1
57,16,22,133,2
21,12,26,965,3
59,16,29,129,2
37,11,20,26,1
38,14,20,140,1
49,9,23,258,1
41,10,30,555,4
Binary file not shown.
@@ -0,0 +1,35 @@
import os
import pandas
import pickle
from sklearn.tree import DecisionTreeRegressor

# Read the preprocessed training data CSV into a DataFrame
storybooks_csv_path = '../step1_prepare/step1_3_storybooks_train.csv'
print(os.path.basename(__file__), 'storybooks_csv_path: {}'.format(storybooks_csv_path))
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)
print(os.path.basename(__file__), 'storybooks_dataframe:\n{}'.format(storybooks_dataframe))
print(os.path.basename(__file__), 'storybooks_dataframe.columns:\n{}'.format(storybooks_dataframe.columns))

# Select the prediction target
y = storybooks_dataframe[['reading_level']]
print(os.path.basename(__file__), 'type(y): {}'.format(type(y)))
print(os.path.basename(__file__), 'y:\n{}'.format(y))

# Choose features
X = storybooks_dataframe[['chapter_count', 'paragraph_count', 'word_count']]
print(os.path.basename(__file__), 'type(X): {}'.format(type(X)))
print(os.path.basename(__file__), 'X:\n{}'.format(X))

# Define model
print(os.path.basename(__file__), 'Defining model...')
reading_level_model = DecisionTreeRegressor(random_state=1)
print(os.path.basename(__file__), 'reading_level_model: {}'.format(reading_level_model))

# Fit model
print(os.path.basename(__file__), 'Fitting model...')
reading_level_model.fit(X, y)

# Save model
print(os.path.basename(__file__), 'Saving model...')
with open('step2_1_model.pkl', 'wb') as file:
    pickle.dump(reading_level_model, file, protocol=5)
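The companion script `step2_train/step2_2_train_model_pmml.py`, imported by `run_all_steps.py`, is not rendered on this page. A minimal sketch of how the PMML export might look with the pinned `sklearn2pmml` dependency; the pipeline layout and the output filename `step2_2_model.pmml` are assumptions, and the conversion itself requires a Java runtime:

```python
import pandas
from sklearn.tree import DecisionTreeRegressor
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

# Read the same preprocessed training data used by step2_1_train_model_pkl.py
storybooks_dataframe = pandas.read_csv('../step1_prepare/step1_3_storybooks_train.csv')
X = storybooks_dataframe[['chapter_count', 'paragraph_count', 'word_count']]
y = storybooks_dataframe['reading_level']

# Wrap the estimator in a PMMLPipeline and fit it
pipeline = PMMLPipeline([('regressor', DecisionTreeRegressor(random_state=1))])
pipeline.fit(X, y)

# Convert the fitted pipeline to a PMML document (assumed filename)
sklearn2pmml(pipeline, 'step2_2_model.pmml')
```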
(Remaining file diffs are not rendered.)
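The `step3_predict` scripts are among the files not shown. A minimal sketch of loading both exported models and scoring the held-out test set, assuming the same relative paths as `run_all_steps.py` (run from `step3_predict/`) and the hypothetical `step2_2_model.pmml` filename from the sketch above:

```python
import pandas
import pickle
from pypmml import Model

# Load the held-out test data prepared in step 1
test_dataframe = pandas.read_csv('../step1_prepare/step1_3_storybooks_test.csv')
X_test = test_dataframe[['chapter_count', 'paragraph_count', 'word_count']]

# Score with the pickled scikit-learn model from step 2.1
with open('../step2_train/step2_1_model.pkl', 'rb') as file:
    pkl_model = pickle.load(file)
print('pkl predictions:', pkl_model.predict(X_test))

# Score with the exported PMML model (pypmml starts a JVM behind the scenes)
pmml_model = Model.load('../step2_train/step2_2_model.pmml')
print('pmml predictions:\n', pmml_model.predict(X_test))
```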