Skip to content

Commit

Permalink
Merge pull request #15 from elimu-ai/experimentation
Browse files Browse the repository at this point in the history
Export PMML
  • Loading branch information
jo-elimu authored Aug 19, 2024
2 parents 94d7133 + e1c81b6 commit ba3eb4b
Show file tree
Hide file tree
Showing 23 changed files with 738 additions and 0 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/pmml.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application (PMML)

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.9", "3.10", "3.11"]
    defaults:
      run:
        # All steps execute inside the pmml/ sub-project
        working-directory: pmml
    steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      # v5 runs on Node 20; setup-python@v3 (Node 16) is deprecated by GitHub.
      # This also keeps the action major versions consistent with checkout@v4.
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest
    - name: Run All Steps (1-3)
      run: |
        python run_all_steps.py
15 changes: 15 additions & 0 deletions pmml/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ML: Storybook Reading Level Predictor (PMML)

## Code Usage

```shell
pip install -r requirements.txt
```

```shell
pytest
```

```shell
python run_all_steps.py
```
5 changes: 5 additions & 0 deletions pmml/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
numpy==1.26.4
pandas==2.2.2
pypmml==0.9.17
scikit-learn==1.5.0
sklearn2pmml==0.110.0
20 changes: 20 additions & 0 deletions pmml/run_all_steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import os

# Runs the full PMML pipeline (prepare -> train -> predict) in one go.
# Each step module does its work at import time (module-level code), so the
# `import` statements below are what actually execute the steps. os.chdir()
# is called *before* each group of imports so that the relative file paths
# each step reads/writes resolve inside that step's own directory. The
# imports themselves still resolve because Python looks packages up on
# sys.path (which includes this script's directory), not the current
# working directory.

print('\n*** Step 1. Prepare Data ***')
os.chdir('step1_prepare')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step1_prepare.step1_1_download_data
import step1_prepare.step1_2_preprocess_data
import step1_prepare.step1_3_split_data

print('\n*** Step 2. Train Model ***')
os.chdir('../step2_train')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step2_train.step2_1_train_model_pkl
import step2_train.step2_2_train_model_pmml

print('\n*** Step 3. Make Prediction ***')
os.chdir('../step3_predict')
print(os.path.basename(__file__), 'os.getcwd(): {}'.format(os.getcwd()))
import step3_predict.step3_1_predict
import step3_predict.step3_2_validate
12 changes: 12 additions & 0 deletions pmml/step1_prepare/step1_1_download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import os
import pandas

# Download the raw Hindi storybooks dataset from the elimu.ai webapp
# repository and keep a local CSV copy for the preprocessing step (1.2).

storybooks_csv_url = (
    'https://raw.githubusercontent.com/elimu-ai/webapp/main/src/main/resources/db/content_PROD'
    '/hin/storybooks.csv'
)
print(os.path.basename(__file__), f'storybooks_csv_url: {storybooks_csv_url}')

# Fetch the remote CSV straight into a DataFrame
storybooks_dataframe = pandas.read_csv(storybooks_csv_url)
print(os.path.basename(__file__), f'storybooks_dataframe:\n{storybooks_dataframe}')

# Persist the raw data next to this script for step 1.2 to pick up
storybooks_dataframe.to_csv('step1_1_storybooks.csv', index=False)
59 changes: 59 additions & 0 deletions pmml/step1_prepare/step1_1_storybooks.csv

Large diffs are not rendered by default.

47 changes: 47 additions & 0 deletions pmml/step1_prepare/step1_2_preprocess_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import json
from os.path import basename, dirname, abspath
import pandas
import sys

# Make the parent (pmml/) directory importable so the shared `utils`
# package can be found regardless of the interpreter's default sys.path.
parent_dir = dirname(dirname(abspath(__file__)))
sys.path.append(parent_dir)
from utils import chapters_utils

# Read the original storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_1_storybooks.csv'
print(basename(__file__), f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# For each row in the DataFrame, extract information from the JSON string stored in the `chapters` column
storybooks_dataframe['chapter_count'] = 0
storybooks_dataframe['paragraph_count'] = 0
storybooks_dataframe['word_count'] = 0
for index in storybooks_dataframe.index:
    print(basename(__file__), f'index: {index}')
    # Single-label `.at` access instead of chained indexing
    # (`.loc[index]['chapters']`), which pandas documents as fragile
    # because the first lookup may return a copy.
    chapters_json = json.loads(storybooks_dataframe.at[index, 'chapters'])

    storybooks_dataframe.at[index, 'chapter_count'] = chapters_utils.get_chapter_count(chapters_json)
    storybooks_dataframe.at[index, 'paragraph_count'] = chapters_utils.get_paragraph_count(chapters_json)
    storybooks_dataframe.at[index, 'word_count'] = chapters_utils.get_word_count(chapters_json)
print(basename(__file__), f'storybooks_dataframe (after extracting data from `chapters` column): \n{storybooks_dataframe}')

# Drop unnecessary columns
storybooks_dataframe = storybooks_dataframe[['id', 'reading_level', 'chapter_count', 'paragraph_count', 'word_count']]
print(basename(__file__), f'storybooks_dataframe (after dropping unnecessary columns): \n{storybooks_dataframe}')

# Drop missing values
storybooks_dataframe = storybooks_dataframe.dropna()
print(basename(__file__), f'storybooks_dataframe (after dropping missing values): \n{storybooks_dataframe}')

# Extract number from reading level (e.g. 'LEVEL1' --> '1')
storybooks_dataframe['reading_level'] = storybooks_dataframe['reading_level'].str.extract(r'(\d+)')
print(basename(__file__), f'storybooks_dataframe (after converting texts to numbers): \n{storybooks_dataframe}')

# Write the DataFrame to a CSV file
storybooks_dataframe.to_csv('step1_2_storybooks.csv', index=False)
59 changes: 59 additions & 0 deletions pmml/step1_prepare/step1_2_storybooks.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
id,reading_level,chapter_count,paragraph_count,word_count
37,1,11,20,26
1,1,7,17,167
36,1,11,29,138
59,2,16,29,129
25,3,19,54,1596
29,1,11,13,110
63,1,15,13,101
2,1,7,11,123
15,3,23,47,1291
6,1,7,7,78
53,2,15,62,486
66,2,15,44,442
56,2,11,37,398
4,1,12,25,117
42,1,15,37,264
61,2,17,29,178
50,2,18,48,811
32,1,10,25,209
11,1,12,31,194
26,2,7,11,393
51,2,19,48,623
7,1,11,13,144
64,2,4,25,85
55,2,18,52,461
34,1,11,19,148
47,1,16,18,117
24,3,14,20,731
27,2,11,23,211
46,1,5,5,81
65,2,12,30,426
60,2,11,42,427
17,3,9,10,374
19,3,15,39,1342
39,1,13,15,177
5,1,11,21,116
28,1,10,17,95
20,3,13,22,672
30,1,13,28,201
22,4,23,75,2057
57,2,16,22,133
33,1,14,13,23
8,1,11,12,150
9,1,11,11,107
16,1,11,13,120
41,4,10,30,555
10,1,6,18,101
58,2,11,16,131
49,1,9,23,258
52,2,20,41,559
18,3,17,38,564
40,1,4,16,89
14,2,14,40,488
23,4,16,55,1507
38,1,14,20,140
54,2,11,28,211
21,3,12,26,965
31,1,4,4,20
48,1,8,15,176
23 changes: 23 additions & 0 deletions pmml/step1_prepare/step1_3_split_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from os.path import basename
import pandas
from sklearn.model_selection import train_test_split

# Split the preprocessed storybook data into training and test sets
# and write each split back out as a CSV for the training step (2.x).

script_name = basename(__file__)

# Read the preprocessed storybooks CSV into a DataFrame
storybooks_csv_path = 'step1_2_storybooks.csv'
print(script_name, f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)

# Split the data into training and validation data (80/20, deterministic seed)
X = storybooks_dataframe[['id', 'chapter_count', 'paragraph_count', 'word_count']]
y = storybooks_dataframe[['reading_level']]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)
for label, frame in (('train_X', train_X), ('train_y', train_y),
                     ('test_X', test_X), ('test_y', test_y)):
    print(script_name, f'{label}: \n{frame}')

# Write the DataFrames to CSV files (features and target rejoined column-wise)
pandas.concat([train_X, train_y], axis=1).to_csv('step1_3_storybooks_train.csv', index=False)
pandas.concat([test_X, test_y], axis=1).to_csv('step1_3_storybooks_test.csv', index=False)
13 changes: 13 additions & 0 deletions pmml/step1_prepare/step1_3_storybooks_test.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
id,chapter_count,paragraph_count,word_count,reading_level
28,10,17,95,1
5,11,21,116,1
9,11,11,107,1
27,11,23,211,2
66,15,44,442,2
36,11,29,138,1
39,13,15,177,1
10,6,18,101,1
64,4,25,85,2
52,20,41,559,2
25,19,54,1596,3
53,15,62,486,2
47 changes: 47 additions & 0 deletions pmml/step1_prepare/step1_3_storybooks_train.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
id,chapter_count,paragraph_count,word_count,reading_level
65,12,30,426,2
33,14,13,23,1
19,15,39,1342,3
8,11,12,150,1
30,13,28,201,1
2,7,11,123,1
42,15,37,264,1
17,9,10,374,3
46,5,5,81,1
31,4,4,20,1
23,16,55,1507,4
11,12,31,194,1
54,11,28,211,2
24,14,20,731,3
61,17,29,178,2
29,11,13,110,1
60,11,42,427,2
50,18,48,811,2
18,17,38,564,3
51,19,48,623,2
40,4,16,89,1
15,23,47,1291,3
4,12,25,117,1
47,16,18,117,1
32,10,25,209,1
16,11,13,120,1
58,11,16,131,2
14,14,40,488,2
22,23,75,2057,4
1,7,17,167,1
56,11,37,398,2
48,8,15,176,1
34,11,19,148,1
63,15,13,101,1
55,18,52,461,2
20,13,22,672,3
7,11,13,144,1
26,7,11,393,2
6,7,7,78,1
57,16,22,133,2
21,12,26,965,3
59,16,29,129,2
37,11,20,26,1
38,14,20,140,1
49,9,23,258,1
41,10,30,555,4
Binary file added pmml/step2_train/step2_1_model.pkl
Binary file not shown.
35 changes: 35 additions & 0 deletions pmml/step2_train/step2_1_train_model_pkl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os
import pandas
import pickle
from sklearn.tree import DecisionTreeRegressor

# Train a decision-tree reading-level model on the step 1.3 training split
# and serialize it with pickle for the prediction step (3.x).

script_name = os.path.basename(__file__)

# Read the preprocessed training data CSV into a DataFrame
storybooks_csv_path = '../step1_prepare/step1_3_storybooks_train.csv'
print(script_name, f'storybooks_csv_path: {storybooks_csv_path}')
storybooks_dataframe = pandas.read_csv(storybooks_csv_path)
print(script_name, f'storybooks_dataframe:\n{storybooks_dataframe}')
print(script_name, f'storybooks_dataframe.columns:\n{storybooks_dataframe.columns}')

# Select the prediction target
y = storybooks_dataframe[['reading_level']]
print(script_name, f'type(y): {type(y)}')
print(script_name, f'y:\n{y}')

# Choose features
X = storybooks_dataframe[['chapter_count', 'paragraph_count', 'word_count']]
print(script_name, f'type(X): {type(X)}')
print(script_name, f'X:\n{X}')

# Define model (fixed seed keeps training reproducible)
print(script_name, 'Defining model...')
reading_level_model = DecisionTreeRegressor(random_state=1)
print(script_name, f'reading_level_model: {reading_level_model}')

# Fit model
print(script_name, 'Fitting model...')
reading_level_model.fit(X, y)

# Save model (pickle protocol 5, as in the original artifact)
print(script_name, 'Saving model...')
with open('step2_1_model.pkl', 'wb') as file:
    pickle.dump(reading_level_model, file, protocol=5)
Loading

0 comments on commit ba3eb4b

Please sign in to comment.