
Commit

Merge pull request #24 from WSU-SEAL/master
Adding WSU-SEAL refactored version of data preparation and testing pipeline
naveenr414 authored Jun 29, 2022
2 parents cf8b76e + 25446f4 commit 53237db
Showing 26 changed files with 7,795 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -90,3 +90,8 @@ Then run
import predict_text
predict_text.predict_text("Your text here")
```

# WSU-SEAL refactored implementation

If you are interested in retraining the model using a new dataset, please check the WSU_SEAL directory.
For any questions regarding this refactored version, please contact Amiangshu Bosu ([email protected]).
28 changes: 28 additions & 0 deletions WSU_SEAL/PPAClient.py
@@ -0,0 +1,28 @@
import requests
import json

api_key = 'YOUR_KEY_HERE' # please do not forget to add your API key here
url = ('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze' +
'?key=' + api_key
)


def get_api_response(data_dict):
    response = requests.post(url=url, data=json.dumps(data_dict))
    response_dict = json.loads(response.content)
    return response_dict

def get_perspective_api_score(text):
    data_dict = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}}
    response_dict = get_api_response(data_dict)
    # print(response_dict)
    # summaryScore.value is the overall TOXICITY probability returned by the API
    toxicity_score = json.dumps(response_dict['attributeScores']['TOXICITY']['summaryScore']['value'])
    return toxicity_score


#value =get_perspective_api_score("I am fine.")
#print(value)
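A minimal usage sketch of this client (an illustration, not part of the committed script): it assumes a valid Perspective API key has been pasted into `api_key` above and that the repository root is on the Python path.

```python
# Hypothetical usage of PPAClient; requires a real Perspective API key in api_key above.
from WSU_SEAL.PPAClient import get_perspective_api_score

score = get_perspective_api_score("I am fine.")   # same sample call as the commented-out line above
print("TOXICITY summary score:", score)           # returned as a JSON-encoded string
```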
48 changes: 48 additions & 0 deletions WSU_SEAL/PretrainedPolitenessModel.py
@@ -0,0 +1,48 @@
import sys
import os
import _pickle
import scipy
import numpy as np
from scipy.sparse import csr_matrix

import vectorizer

MODEL_FILENAME = os.path.join(os.path.split(__file__)[0], 'models/wsu-seal-retrain-politeness-svm.p')

# Load model, initialize vectorizer
clf = _pickle.load(open(MODEL_FILENAME, 'rb'), encoding='latin1', fix_imports=True)
vectorizer = vectorizer.PolitenessFeatureVectorizer()

def score(request):
    """
    :param request - The request document to score
    :type request - dict with 'sentences' and 'parses' field
        sample (taken from test_documents.py)--
        {
            'sentences': [
                "Have you found the answer for your question?",
                "If yes would you please share it?"
            ],
            'parses': [
                ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
                 "root(ROOT-0, found-3)", "det(answer-5, the-4)",
                 "dobj(found-3, answer-5)", "poss(question-8, your-7)",
                 "prep_for(found-3, question-8)"],
                ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
                 "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
                 "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
            ]
        }
    returns class probabilities as a dict
        {'polite': float, 'impolite': float}
    """
    # Vectorizer returns {feature-name: value} dict
    features = vectorizer.features(request)
    fv = [features[f] for f in sorted(features.keys())]
    # Single-row sparse matrix
    X = csr_matrix(np.asarray([fv]))
    probs = clf.predict_proba(X)
    # Massage return format
    probs = {"polite": probs[0][1], "impolite": probs[0][0]}
    return probs
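A minimal usage sketch (an illustration, not part of the committed file): it assumes the pickled SVM and the local `vectorizer` module are available and the script is run from inside the WSU_SEAL directory, since the `vectorizer` import is not package-relative. The request reuses the sample from the docstring above.

```python
# Hypothetical usage of the pretrained politeness model, reusing the docstring sample
# (each sentence paired with its dependency parse).
from PretrainedPolitenessModel import score

request = {
    'sentences': [
        "Have you found the answer for your question?",
        "If yes would you please share it?"
    ],
    'parses': [
        ["csubj(found-3, Have-1)", "dobj(Have-1, you-2)",
         "root(ROOT-0, found-3)", "det(answer-5, the-4)",
         "dobj(found-3, answer-5)", "poss(question-8, your-7)",
         "prep_for(found-3, question-8)"],
        ["prep_if(would-3, yes-2)", "root(ROOT-0, would-3)",
         "nsubj(would-3, you-4)", "ccomp(would-3, please-5)",
         "nsubj(it-7, share-6)", "xcomp(please-5, it-7)"]
    ]
}
print(score(request))  # e.g. {'polite': ..., 'impolite': ...}
```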
107 changes: 107 additions & 0 deletions WSU_SEAL/STRUDEL_CV.py
@@ -0,0 +1,107 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn_pandas import DataFrameMapper
import algorithms
import data_cleaner

class STRUDEL_MODEL:
    def __init__(self, X_train, Y_train):
        self.vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                          ngram_range=(1, 1), max_features=5000)

        self.mapper = None
        self.Y = None
        self.X = None

        self.clf = algorithms.linear_svm_model()
        self.__prepare_data(X_train, Y_train)
        self.model = self.train()

    def __prepare_data(self, X_train, Y_train):
        self.mapper = DataFrameMapper([
            ('text', self.vectorizer),
            ('num_url', None),
            ('num_emoji', None),
            ('num_mention', None),
            ('nltk_score', None),
            ('subjectivity', None),
            ('polarity', None),
            ('perspective_score', None),
            ('stanford_polite', None),
        ])
        self.Y = np.ravel(Y_train)

        self.X = self.mapper.fit_transform(X_train)  # adding the other features with bag-of-words

    def train(self):
        print("Training the model with " + str(len(self.Y)) + " instances and "
              + str(self.X.shape[1]) + " features")
        self.clf.fit(self.X, self.Y)
        print("Model training complete ..")
        return self.clf

    def predict(self, X_test):
        X_test_mapped = self.mapper.transform(X_test)
        predictions = self.model.predict(X_test_mapped)
        return np.expand_dims(predictions, 1)


def read_dataframe_from_excel(file):
    dataframe = pd.read_excel(file)
    return dataframe

print("Reading dataset..")
#training_data = read_dataframe_from_excel("models/code_review_preprocessed.xlsx")
training_data = read_dataframe_from_excel("models/STRUDEL-issue-comments-dataset.xlsx")

print("Applying SE domain specific cleaning steps..")
training_data["text"] = training_data.text.astype(str).apply(data_cleaner.clean_text)

kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=999)

filename = "results/strudel-CV-issue-comments.csv"
#filename = "results/strudel-CV-code-review.csv"
training_log = open(filename, 'w')
training_log.write("Fold,precision_0,recall_0,f-score_0,precision_1,recall_1,f-score_1,accuracy\n")

count = 1
results = ""
print("Starting 10-fold cross validations..")
feature_columns = ["text", "perspective_score", "num_url",
                   "num_emoji", "num_mention", "nltk_score", "num_reference",
                   "subjectivity", "polarity", "stanford_polite"]

for train_index, test_index in kf.split(training_data, training_data["is_toxic"]):
    X_train, X_test = training_data.loc[train_index, feature_columns], \
                      training_data.loc[test_index, feature_columns]

    Y_train, Y_test = training_data.loc[train_index, "is_toxic"], training_data.loc[test_index, "is_toxic"]

    print("Fold# " + str(count))
    classifier_model = STRUDEL_MODEL(X_train, Y_train)

    predictions = classifier_model.predict(X_test)

    precision_1 = precision_score(Y_test, predictions, pos_label=1)
    recall_1 = recall_score(Y_test, predictions, pos_label=1)
    f1score_1 = f1_score(Y_test, predictions, pos_label=1)

    precision_0 = precision_score(Y_test, predictions, pos_label=0)
    recall_0 = recall_score(Y_test, predictions, pos_label=0)
    f1score_0 = f1_score(Y_test, predictions, pos_label=0)
    accuracy = accuracy_score(Y_test, predictions)

    results = results + str(count) + ","
    results = results + str(precision_0) + "," + str(recall_0) + "," + str(f1score_0)
    results = results + "," + str(precision_1) + "," + str(recall_1) + "," + str(f1score_1) + \
              "," + str(accuracy) + "\n"

    print(classification_report(Y_test, predictions))

    count += 1

# write the accumulated per-fold results once all folds are complete
training_log.write(results)
training_log.flush()
Empty file added WSU_SEAL/__init__.py
Empty file.
35 changes: 35 additions & 0 deletions WSU_SEAL/algorithms.py
@@ -0,0 +1,35 @@
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# To avoid import errors, these methods are copied from src.classifiers.
def bayes_model():
    """Bayes"""
    return GaussianNB()

def linear_svm_model(C=10**1.5):
    """Linear SVM"""
    return svm.LinearSVC(C=C, max_iter=10000)

def svm_model(C=10**1.5, gamma='scale'):
    """SVM"""
    return svm.SVC(gamma=gamma, C=C, probability=True)

def logistic_model(C=1):
    """Logistic"""
    return LogisticRegression(C=C, solver='lbfgs', multi_class='multinomial', max_iter=4000)

def decision_tree_model():
    """Decision Tree"""
    return tree.DecisionTreeClassifier()

def random_forest_model(n_estimators=100, max_features="sqrt", max_depth=None, min_samples_leaf=1):
    """Random Forest"""
    # "sqrt" matches the old "auto" default for classifiers; max_depth is passed through to the estimator.
    return RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                  max_depth=max_depth, min_samples_leaf=min_samples_leaf)

def knn_model(k=5):
    """KNN"""
    return KNeighborsClassifier(n_neighbors=k)
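Each factory returns an unfitted scikit-learn estimator, so they can be swapped interchangeably. A minimal sketch with toy data (purely illustrative, not part of the committed file):

```python
# Hypothetical example: the factories are interchangeable scikit-learn estimators.
import numpy as np
import algorithms

X = np.array([[0.0, 1.0], [1.0, 0.0], [0.9, 0.1], [0.1, 0.9]])  # toy features
y = np.array([0, 1, 1, 0])                                       # toy labels

clf = algorithms.linear_svm_model()  # same factory STRUDEL_CV.py uses
clf.fit(X, y)
print(clf.predict([[0.8, 0.2]]))     # expected: [1] on this toy data
```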
76 changes: 76 additions & 0 deletions WSU_SEAL/data_cleaner.py
@@ -0,0 +1,76 @@
import math
import pickle
import re
from wordfreq import word_frequency
from nltk import RegexpTokenizer

from collections import defaultdict

def log_odds(counts1, counts2):
    prior = counts2

    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)

    for word in prior.keys():
        prior[word] = int(prior[word] + 0.5)

    for word in counts2.keys():
        counts1[word] = int(counts1[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    for word in counts1.keys():
        counts2[word] = int(counts2[word] + 0.5)
        if prior[word] == 0:
            prior[word] = 1

    n1 = sum(counts1.values())
    n2 = sum(counts2.values())
    nprior = sum(prior.values())

    for word in prior.keys():
        if prior[word] > 0:
            l1 = float(counts1[word] + prior[word]) / ((n1 + nprior) - (counts1[word] + prior[word]))
            l2 = float(counts2[word] + prior[word]) / ((n2 + nprior) - (counts2[word] + prior[word]))
            sigmasquared[word] = 1 / (float(counts1[word]) + float(prior[word])) + 1 / (float(counts2[word]) + float(prior[word]))
            sigma[word] = math.sqrt(sigmasquared[word])
            delta[word] = (math.log(l1) - math.log(l2)) / sigma[word]

    different_words = []

    for word in sorted(delta, key=delta.get):
        if delta[word] > 1.645:
            different_words.append(word)

    return different_words

counter = pickle.load(open("./models/github_words.p","rb"))
our_words = dict([(i,word_frequency(i,"en")*10**9) for i in counter])
different_words = log_odds(defaultdict(int,counter),defaultdict(int,our_words))


def clean_text(text):
    result = []
    words = text.split(" ")
    words = [a.strip(',.!?:; ') for a in words]

    words = list(set(words))
    words = [word for word in words if not word.isalpha() or word.lower() in different_words]

    for word in set(words):
        # Maybe unkify?
        result += [re.sub(r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]', ' potato ', " " + text.lower() + " ").strip()]

    tokenizer = RegexpTokenizer(r'\w+')
    all_words = tokenizer.tokenize(text)
    # logging.info("all_words "+str(all_words))
    # Try removing all unknown words
    for word in set(all_words):
        if word.lower() not in counter and word_frequency(word.lower(), "en") == 0 and len(word) > 2:
            text = text.replace(word, '')

    result += [text]
    text_mod = ' '.join(result)
    return text_mod
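A minimal usage sketch (an illustration, not part of the committed file): it assumes the working directory is WSU_SEAL so the pickled models/github_words.p counter loads at import time; the exact output depends on those pickled word counts.

```python
# Hypothetical example: SE-domain-specific tokens are masked and unknown tokens dropped.
import data_cleaner

raw = "Thanks, but this patch still throws a NullPointerException on my machine."
print(data_cleaner.clean_text(raw))
```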
Binary file added WSU_SEAL/models/all_words.p
Binary file not shown.