Merge pull request #24 from WSU-SEAL/master
Adding WSU-SEAL refactored version of data preparation and testing pipeline
Showing 26 changed files with 7,795 additions and 0 deletions.
@@ -90,3 +90,8 @@ Then run
import predict_text
predict_text.predict_text("Your text here")
```

# WSU-SEAL refactored implementation

If you are interested in retraining the model using a new dataset, please check the WSU-SEAL directory.
For any questions regarding this refactored version, please contact Amiangshu Bosu ([email protected]).
@@ -0,0 +1,28 @@
import requests
import json

api_key = 'YOUR_KEY_HERE'  # please do not forget to add your API key here
url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze' +
       '?key=' + api_key)


def get_api_response(data_dict):
    # POST the request payload to the Perspective API and decode the JSON reply
    response = requests.post(url=url, data=json.dumps(data_dict))
    response_dict = json.loads(response.content)
    return response_dict


def get_perspective_api_score(text):
    data_dict = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}}
    response_dict = get_api_response(data_dict)
    # print(response_dict)
    # Note: json.dumps() serializes the float score, so a string is returned
    toxicity_score = json.dumps(response_dict['attributeScores']['TOXICITY']['summaryScore']['value'])
    return toxicity_score


# value = get_perspective_api_score("I am fine.")
# print(value)
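A minimal usage sketch, assuming a valid API key has been filled in above. Because `get_perspective_api_score()` returns the score serialized with `json.dumps()` (i.e. as a string), convert it back before doing arithmetic:

```
# Assumes `api_key` above holds a valid Perspective API key.
raw = get_perspective_api_score("I am fine.")  # mirrors the commented-out example above
toxicity = float(raw)  # the helper returns a JSON-serialized string
print("TOXICITY = %.3f" % toxicity)
```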
@@ -0,0 +1,48 @@
import sys
import os
import _pickle
import scipy
import numpy as np
from scipy.sparse import csr_matrix

import vectorizer

MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'models/wsu-seal-retrain-politeness-svm.p')

# Load model, initialize vectorizer
clf = _pickle.load(open(MODEL_FILENAME, 'rb'), encoding='latin1', fix_imports=True)
vectorizer = vectorizer.PolitenessFeatureVectorizer()


def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?",
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
                 "root(ROOT-0, found-3)", "det(answer-5, the-4)",
                 "dobj(found-3, answer-5)", "poss(question-8, your-7)",
                 "prep_for(found-3, question-8)"],
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
                 "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
                 "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        }
    returns class probabilities as a dict
        { 'polite': float, 'impolite': float }
    """
    # Vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.keys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    probs = clf.predict_proba(X)
    # Massage return format
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs
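A quick usage sketch, reusing the sample request from the docstring above (it assumes the pickled model file and the `vectorizer` module are available):

```
request = {
    'sentences': [
        "Have you found the answer for your question?",
        "If yes would you please share it?"
    ],
    'parses': [
        ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
         "root(ROOT-0, found-3)", "det(answer-5, the-4)",
         "dobj(found-3, answer-5)", "poss(question-8, your-7)",
         "prep_for(found-3, question-8)"],
        ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
         "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
         "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
    ]
}
probs = score(request)
print(probs)  # e.g. {'polite': ..., 'impolite': ...}; values depend on the trained model
```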
@@ -0,0 +1,107 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn_pandas import DataFrameMapper
import algorithms
import data_cleaner


class STRUDEL_MODEL:
    def __init__(self, X_train, Y_train):
        self.vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1, 1), max_features=5000)

        self.mapper = None
        self.Y = None
        self.X = None

        self.clf = algorithms.linear_svm_model()
        self.__prepare_data(X_train, Y_train)
        self.model = self.train()

    def __prepare_data(self, X_train, Y_train):
        # Combine TF-IDF bag-of-words with the precomputed numeric features
        self.mapper = DataFrameMapper([
            ('text', self.vectorizer),
            ('num_url', None),
            ('num_emoji', None),
            ('num_mention', None),
            ('nltk_score', None),
            ('subjectivity', None),
            ('polarity', None),
            ('perspective_score', None),
            ('stanford_polite', None),
        ])
        self.Y = np.ravel(Y_train)
        self.X = self.mapper.fit_transform(X_train)

    def train(self):
        print("Training the model with " + str(len(self.Y)) + " instances and "
              + str(self.X.shape[1]) + " features")
        self.clf.fit(self.X, self.Y)
        print("Model training complete ..")
        return self.clf

    def predict(self, X_test):
        X_test_mapped = self.mapper.transform(X_test)
        predictions = self.model.predict(X_test_mapped)
        return np.expand_dims(predictions, 1)


def read_dataframe_from_excel(file):
    dataframe = pd.read_excel(file)
    return dataframe


print("Reading dataset..")
# training_data = read_dataframe_from_excel("models/code_review_preprocessed.xlsx")
training_data = read_dataframe_from_excel("models/STRUDEL-issue-comments-dataset.xlsx")

print("Applying SE domain specific cleaning steps..")
training_data["text"] = training_data.text.astype(str).apply(data_cleaner.clean_text)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=999)

filename = "results/strudel-CV-issue-comments.csv"
# filename = "results/strudel-CV-code-review.csv"
training_log = open(filename, 'w')
training_log.write("Fold,precision_0,recall_0,f-score_0,precision_1,recall_1,f-score_1,accuracy\n")

feature_columns = ["text", "perspective_score", "num_url",
                   "num_emoji", "num_mention", "nltk_score", "num_reference",
                   "subjectivity", "polarity", "stanford_polite"]

count = 1
print("Starting 10-fold cross validations..")
for train_index, test_index in kf.split(training_data, training_data["is_toxic"]):
    X_train, X_test = training_data.loc[train_index, feature_columns], \
                      training_data.loc[test_index, feature_columns]

    Y_train, Y_test = training_data.loc[train_index, "is_toxic"], training_data.loc[test_index, "is_toxic"]

    print("Fold# " + str(count))
    classifier_model = STRUDEL_MODEL(X_train, Y_train)

    predictions = classifier_model.predict(X_test)

    precision_1 = precision_score(Y_test, predictions, pos_label=1)
    recall_1 = recall_score(Y_test, predictions, pos_label=1)
    f1score_1 = f1_score(Y_test, predictions, pos_label=1)

    precision_0 = precision_score(Y_test, predictions, pos_label=0)
    recall_0 = recall_score(Y_test, predictions, pos_label=0)
    f1score_0 = f1_score(Y_test, predictions, pos_label=0)
    accuracy = accuracy_score(Y_test, predictions)

    # Write one CSV row per fold so partial results survive an interrupted run
    row = (str(count) + "," + str(precision_0) + "," + str(recall_0) + "," + str(f1score_0)
           + "," + str(precision_1) + "," + str(recall_1) + "," + str(f1score_1)
           + "," + str(accuracy) + "\n")

    print(classification_report(Y_test, predictions))

    count += 1
    training_log.write(row)
    training_log.flush()

training_log.close()
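Once the loop finishes, the per-fold CSV can be summarized into averages. A minimal sketch of such post-processing (the column names match the header written above; everything else is illustrative and not part of the script):

```
import pandas as pd

# Hypothetical post-processing: mean scores across the 10 folds.
results_df = pd.read_csv("results/strudel-CV-issue-comments.csv")
print(results_df[["precision_1", "recall_1", "f-score_1", "accuracy"]].mean())
```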
Empty file.
@@ -0,0 +1,35 @@
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


# To avoid import errors, these methods are copied from src.classifiers.
def bayes_model():
    """Bayes"""
    return GaussianNB()


def linear_svm_model(C=10**1.5):
    """Linear SVM"""
    return svm.LinearSVC(C=C, max_iter=10000)


def svm_model(C=10**1.5, gamma='scale'):
    """SVM"""
    return svm.SVC(gamma=gamma, C=C, probability=True)


def logistic_model(C=1):
    """Logistic"""
    return LogisticRegression(C=C, solver='lbfgs', multi_class='multinomial', max_iter=4000)


def decision_tree_model():
    """Decision Tree"""
    return tree.DecisionTreeClassifier()


def random_forest_model(n_estimators=100, max_features="auto", max_depth=None, min_samples_leaf=1):
    """Random Forest"""
    return RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                  max_depth=max_depth, min_samples_leaf=min_samples_leaf)


def knn_model(k=5):
    """KNN"""
    return KNeighborsClassifier(n_neighbors=k)
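These factories are drop-in interchangeable, since each returns an unfitted scikit-learn estimator. A small sketch on made-up toy data (the arrays are illustrative only):

```
import numpy as np

# Illustrative toy data: four 2-D samples with binary labels.
X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.9]])
y = np.array([0, 1, 1, 0])

clf = logistic_model(C=1)  # any of the factories above could be swapped in
clf.fit(X, y)
print(clf.predict([[0.2, 0.8]]))  # expected: [0]
```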
@@ -0,0 +1,76 @@
import math
import pickle
import re
from wordfreq import word_frequency
from nltk import RegexpTokenizer

from collections import defaultdict


def log_odds(counts1, counts2):
    # Note: `prior` aliases counts2, so the adjustments below mutate both
    prior = counts2

    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)

    for word in prior.keys():
        prior[word] = int(prior[word] + 0.5)

    for word in counts2.keys():
        counts1[word] = int(counts1[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    for word in counts1.keys():
        counts2[word] = int(counts2[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    n1 = sum(counts1.values())
    n2 = sum(counts2.values())
    nprior = sum(prior.values())

    for word in prior.keys():
        if prior[word] > 0:
            l1 = float(counts1[word] + prior[word]) / ((n1 + nprior) - (counts1[word] + prior[word]))
            l2 = float(counts2[word] + prior[word]) / ((n2 + nprior) - (counts2[word] + prior[word]))
            sigmasquared[word] = 1 / (float(counts1[word]) + float(prior[word])) + 1 / (float(counts2[word]) + float(prior[word]))
            sigma[word] = math.sqrt(sigmasquared[word])
            delta[word] = (math.log(l1) - math.log(l2)) / sigma[word]

    different_words = []

    # Keep words whose z-scored log-odds exceeds 1.645 (95% one-tailed)
    for word in sorted(delta, key=delta.get):
        if delta[word] > 1.645:
            different_words.append(word)

    return different_words


counter = pickle.load(open("./models/github_words.p", "rb"))
our_words = dict([(i, word_frequency(i, "en") * 10**9) for i in counter])
different_words = log_odds(defaultdict(int, counter), defaultdict(int, our_words))


def clean_text(text):
    result = []
    words = text.split(" ")
    words = [a.strip(',.!?:; ') for a in words]

    words = list(set(words))
    words = [word for word in words if not word.isalpha() or word.lower() in different_words]

    for word in set(words):
        # Maybe unkify?
        result += [re.sub(r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]', ' potato ', " " + text.lower() + " ").strip()]

    tokenizer = RegexpTokenizer(r'\w+')
    all_words = tokenizer.tokenize(text)
    # logging.info("all_words " + str(all_words))
    # Try removing all unknown words
    for word in set(all_words):
        if word.lower() not in counter and word_frequency(word.lower(), "en") == 0 and len(word) > 2:
            text = text.replace(word, '')

    result += [text]
    text_mod = ' '.join(result)
    return text_mod
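The `log_odds` routine can be exercised on its own with toy counts. A minimal sketch (the counts are made up; it assumes the module-level pickle load above succeeded, or that `log_odds` has been copied into a standalone script):

```
from collections import defaultdict

# Made-up counts: "merge" is heavily over-represented in corpus 1.
corpus1 = defaultdict(int, {"merge": 500, "hello": 5, "the": 1000})
corpus2 = defaultdict(int, {"merge": 10, "hello": 50, "the": 1000})

print(log_odds(corpus1, corpus2))  # expected: ['merge']
```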
Binary file not shown.
Binary file not shown.