
Commit

Merge pull request #24 from WSU-SEAL/master
Adding WSU-SEAL refactored version of data preparation and testing pipeline
naveenr414 authored Jun 29, 2022
2 parents cf8b76e + 25446f4 commit 53237db
Showing 26 changed files with 7,795 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -90,3 +90,8 @@ Then run
import predict_text
predict_text.predict_text("Your text here")
```

# WSU-SEAL refactored implementation

If you are interested in retraining the model using a new dataset, please check the WSU_SEAL directory.
For any questions regarding this refactored version, please contact Amiangshu Bosu ([email protected]).
28 changes: 28 additions & 0 deletions WSU_SEAL/PPAClient.py
@@ -0,0 +1,28 @@
import requests
import json

api_key = 'YOUR_KEY_HERE' # please do not forget to add your API key here
url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze' +
'?key=' + api_key
)


def get_api_response(data_dict):
    response = requests.post(url=url, data=json.dumps(data_dict))
    response_dict = json.loads(response.content)
    return response_dict

def get_perspective_api_score(text):
    data_dict = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}}
    response_dict = get_api_response(data_dict)
    # print(response_dict)
    # summaryScore.value is the overall TOXICITY probability returned by the API
    toxicity_score = json.dumps(response_dict['attributeScores']['TOXICITY']['summaryScore']['value'])
    return toxicity_score


#value =get_perspective_api_score("I am fine.")
#print(value)
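A minimal usage sketch of this client (an illustration, not part of the committed script): it assumes a valid Perspective API key has been pasted into `api_key` above and that the repository root is on the Python path.

```python
# Hypothetical usage of PPAClient; requires a real Perspective API key in api_key above.
from WSU_SEAL.PPAClient import get_perspective_api_score

score = get_perspective_api_score("I am fine.")   # same sample call as the commented-out line above
print("TOXICITY summary score:", score)           # returned as a JSON-encoded string
```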
48 changes: 48 additions & 0 deletions WSU_SEAL/PretrainedPolitenessModel.py
@@ -0,0 +1,48 @@
import sys
import os
import _pickle
import scipy
import numpy as np
from scipy.sparse import csr_matrix

import vectorizer

MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'models/wsu-seal-retrain-politeness-svm.p')

# Load model, initialize vectorizer
clf = _pickle.load(open(MODEL_FILENAME, 'rb'), encoding='latin1', fix_imports=True)
vectorizer = vectorizer.PolitenessFeatureVectorizer()

def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?",
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
                 "root(ROOT-0, found-3)", "det(answer-5, the-4)",
                 "dobj(found-3, answer-5)", "poss(question-8, your-7)",
                 "prep_for(found-3, question-8)"],
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
                 "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
                 "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        }
    returns class probabilities as a dict
        {'polite': float, 'impolite': float}
    """
    # Vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.keys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    probs = clf.predict_proba(X)
    # Massage return format
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs
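A minimal usage sketch (an illustration, not part of the committed file): it assumes the pickled SVM and the local `vectorizer` module are available and the script is run from inside the WSU_SEAL directory, since the `vectorizer` import is not package-relative. The request reuses the sample from the docstring above.

```python
# Hypothetical usage of the pretrained politeness model, reusing the docstring sample
# (each sentence paired with its dependency parse).
from PretrainedPolitenessModel import score

request = {
    'sentences': [
        "Have you found the answer for your question?",
        "If yes would you please share it?"
    ],
    'parses': [
        ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
         "root(ROOT-0, found-3)", "det(answer-5, the-4)",
         "dobj(found-3, answer-5)", "poss(question-8, your-7)",
         "prep_for(found-3, question-8)"],
        ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
         "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
         "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
    ]
}
print(score(request))  # e.g. {'polite': ..., 'impolite': ...}
```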
107 changes: 107 additions & 0 deletions WSU_SEAL/STRUDEL_CV.py
@@ -0,0 +1,107 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn_pandas import DataFrameMapper
import algorithms
import data_cleaner

class STRUDEL_MODEL:
    def __init__(self, X_train, Y_train):
        self.vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1, 1), max_features=5000)

        self.mapper = None
        self.Y = None
        self.X = None

        self.clf = algorithms.linear_svm_model()
        self.__prepare_data(X_train, Y_train)
        self.model = self.train()

    def __prepare_data(self, X_train, Y_train):
        self.mapper = DataFrameMapper([
            ('text', self.vectorizer),
            ('num_url', None),
            ('num_emoji', None),
            ('num_mention', None),
            ('nltk_score', None),
            ('subjectivity', None),
            ('polarity', None),
            ('perspective_score', None),
            ('stanford_polite', None),
        ])
        self.Y = np.ravel(Y_train)

        self.X = self.mapper.fit_transform(X_train)  # adding the other features with bag-of-words

    def train(self):
        print("Training the model with " + str(len(self.Y)) + " instances and "
              + str(self.X.shape[1]) + " features")
        self.clf.fit(self.X, self.Y)
        print("Model training complete ..")
        return self.clf

    def predict(self, X_test):
        X_test_mapped = self.mapper.transform(X_test)
        predictions = self.model.predict(X_test_mapped)
        return np.expand_dims(predictions, 1)


def read_dataframe_from_excel(file):
    dataframe = pd.read_excel(file)
    return dataframe

print("Reading dataset..")
#training_data = read_dataframe_from_excel("models/code_review_preprocessed.xlsx")
training_data = read_dataframe_from_excel("models/STRUDEL-issue-comments-dataset.xlsx")

print("Applying SE domain specific cleaning steps..")
training_data["text"] = training_data.text.astype(str).apply(data_cleaner.clean_text)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=999)

filename = "results/strudel-CV-issue-comments.csv"
#filename = "results/strudel-CV-code-review.csv"
training_log = open(filename, 'w')
training_log.write("Fold,precision_0,recall_0,f-score_0,precision_1,recall_1,f-score_1,accuracy\n")

count = 1
results = ""
print("Starting 10-fold cross validations..")
feature_columns = ["text", "perspective_score", "num_url",
                   "num_emoji", "num_mention", "nltk_score", "num_reference",
                   "subjectivity", "polarity", "stanford_polite"]

for train_index, test_index in kf.split(training_data, training_data["is_toxic"]):
    X_train, X_test = training_data.loc[train_index, feature_columns], \
                      training_data.loc[test_index, feature_columns]

    Y_train, Y_test = training_data.loc[train_index, "is_toxic"], training_data.loc[test_index, "is_toxic"]

    print("Fold# " + str(count))
    classifier_model = STRUDEL_MODEL(X_train, Y_train)

    predictions = classifier_model.predict(X_test)

    precision_1 = precision_score(Y_test, predictions, pos_label=1)
    recall_1 = recall_score(Y_test, predictions, pos_label=1)
    f1score_1 = f1_score(Y_test, predictions, pos_label=1)

    precision_0 = precision_score(Y_test, predictions, pos_label=0)
    recall_0 = recall_score(Y_test, predictions, pos_label=0)
    f1score_0 = f1_score(Y_test, predictions, pos_label=0)
    accuracy = accuracy_score(Y_test, predictions)

    results = results + str(count) + ","
    results = results + str(precision_0) + "," + str(recall_0) + "," + str(f1score_0)
    results = results + "," + str(precision_1) + "," + str(recall_1) + "," + str(f1score_1) + \
              "," + str(accuracy) + "\n"

    print(classification_report(Y_test, predictions))

    count += 1

# write the accumulated per-fold results once all folds are complete
training_log.write(results)
training_log.flush()
Empty file added WSU_SEAL/__init__.py
Empty file.
35 changes: 35 additions & 0 deletions WSU_SEAL/algorithms.py
@@ -0,0 +1,35 @@
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# To avoid import errors, these methods are copied from src.classifiers.
def bayes_model():
    """Bayes"""
    return GaussianNB()

def linear_svm_model(C=10**1.5):
    """Linear SVM"""
    return svm.LinearSVC(C=C, max_iter=10000)

def svm_model(C=10**1.5, gamma='scale'):
    """SVM"""
    return svm.SVC(gamma=gamma, C=C, probability=True)

def logistic_model(C=1):
    """Logistic"""
    return LogisticRegression(C=C, solver='lbfgs', multi_class='multinomial', max_iter=4000)

def decision_tree_model():
    """Decision Tree"""
    return tree.DecisionTreeClassifier()

def random_forest_model(n_estimators=100, max_features="sqrt", max_depth=None, min_samples_leaf=1):
    """Random Forest"""
    # "sqrt" matches the old "auto" default for classifiers; max_depth is passed through to the estimator.
    return RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                  max_depth=max_depth, min_samples_leaf=min_samples_leaf)

def knn_model(k=5):
    """KNN"""
    return KNeighborsClassifier(n_neighbors=k)
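Each factory returns an unfitted scikit-learn estimator, so they can be swapped interchangeably. A minimal sketch with toy data (purely illustrative, not part of the committed file):

```python
# Hypothetical example: the factories are interchangeable scikit-learn estimators.
import numpy as np
import algorithms

X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.9]])  # toy features
y = np.array([0, 1, 1, 0])                                       # toy labels

clf = algorithms.linear_svm_model()  # same factory STRUDEL_CV.py uses
clf.fit(X, y)
print(clf.predict([[0.8, 0.2]]))     # expected: [1] on this toy data
```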
76 changes: 76 additions & 0 deletions WSU_SEAL/data_cleaner.py
@@ -0,0 +1,76 @@
import math
import pickle
import re
from wordfreq import word_frequency
from nltk import RegexpTokenizer

from collections import defaultdict

def log_odds(counts1, counts2):
    prior = counts2

    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)

    for word in prior.keys():
        prior[word] = int(prior[word] + 0.5)

    for word in counts2.keys():
        counts1[word] = int(counts1[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    for word in counts1.keys():
        counts2[word] = int(counts2[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    n1 = sum(counts1.values())
    n2 = sum(counts2.values())
    nprior = sum(prior.values())

    for word in prior.keys():
        if prior[word] > 0:
            l1 = float(counts1[word] + prior[word]) / ((n1 + nprior) - (counts1[word] + prior[word]))
            l2 = float(counts2[word] + prior[word]) / ((n2 + nprior) - (counts2[word] + prior[word]))
            sigmasquared[word] = 1 / (float(counts1[word]) + float(prior[word])) + 1 / (float(counts2[word]) + float(prior[word]))
            sigma[word] = math.sqrt(sigmasquared[word])
            delta[word] = (math.log(l1) - math.log(l2)) / sigma[word]

    different_words = []

    for word in sorted(delta, key=delta.get):
        if delta[word] > 1.645:
            different_words.append(word)

    return different_words

counter = pickle.load(open("./models/github_words.p","rb"))
our_words = dict([(i,word_frequency(i,"en")*10**9) for i in counter])
different_words = log_odds(defaultdict(int,counter),defaultdict(int,our_words))


def clean_text(text):
    result = []
    words = text.split(" ")
    words = [a.strip(',.!?:; ') for a in words]

    words = list(set(words))
    words = [word for word in words if not word.isalpha() or word.lower() in different_words]

    for word in set(words):
        # Maybe unkify?
        result += [re.sub(r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]', ' potato ', " " + text.lower() + " ").strip()]

    tokenizer = RegexpTokenizer(r'\w+')
    all_words = tokenizer.tokenize(text)
    # logging.info("all_words "+str(all_words))
    # Try removing all unknown words
    for word in set(all_words):
        if word.lower() not in counter and word_frequency(word.lower(), "en") == 0 and len(word) > 2:
            text = text.replace(word, '')

    result += [text]
    text_mod = ' '.join(result)
    return text_mod
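A minimal usage sketch (an illustration, not part of the committed file): it assumes the working directory is WSU_SEAL so the pickled models/github_words.p counter loads at import time; the exact output depends on those pickled word counts.

```python
# Hypothetical example: SE-domain-specific tokens are masked and unknown tokens dropped.
import data_cleaner

raw = "Thanks, but this patch still throws a NullPointerException on my machine."
print(data_cleaner.clean_text(raw))
```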
Binary file added WSU_SEAL/models/all_words.p
Binary file not shown.