# implementation.R

library(car)
library("data.table")
library("assertthat")
library("fst")
library(stringr)
library(plyr)
library("ggplot2")
library(ggrepel)
library(scales)
library(xtable)
library(MASS)
library(gplots)
library(dplyr)
library(gridExtra)
library(grid)
library(tidyr)


# Initializes the environment for executing our scripts. A script is identified
# by its name, which doubles as the directory in which all outputs of the script
# are stored. This function makes sure the `Figures` and `Data` subdirectories of
# that directory exist and clears any previous contents, so that each execution
# starts fresh and results from older runs are not mixed in.
#
# Arguments:
#   envDir  Output directory. When NULL, the global WORKING_DIR is used and must
#           already exist; otherwise the global WORKING_DIR is set to envDir.
#
# Side effects: deletes and recreates <envDir>/Figures and <envDir>/Data, and
# sets the globals LATEX_LOGFILE and CSV_LOGFILE (plus WORKING_DIR when envDir
# is given).
initializeEnvironment = function(envDir = NULL) {
    if (is.null(envDir)) {
        if (! exists("WORKING_DIR"))
            stop("WORKING_DIR must be specified if default argument for envDir is to be used when initializing the script environment")
        envDir = WORKING_DIR
    } else {
        WORKING_DIR <<- envDir
    }
    cat("WORKING_DIR: ", envDir, "\n")
    # Use R's own file operations rather than pasting the path into a shell
    # command: this keeps paths with spaces or shell metacharacters intact and
    # does not depend on `rm` / `mkdir` being available.
    for (subDir in c("Figures", "Data")) {
        target = file.path(envDir, subDir)
        unlink(target, recursive = TRUE, force = TRUE)
        dir.create(target, recursive = TRUE, showWarnings = FALSE)
    }
    LATEX_LOGFILE <<- paste0(envDir, "/Data/data.tex")
    CSV_LOGFILE <<- paste0(envDir, "/Data/data.csv")
}

# A custom, non-aborting assertion which we use to make sure certain assumptions
# about the data or analysis results are valid. It prints one line of the form
# "Assertion:  <expression> :  Pass|Fail" and never stops execution.
# Note that this definition also binds the name `assert_that`, shadowing the
# function of the same name from the assertthat package loaded above.
#
# Arguments:
#   e  The condition to check. It passes only when it is non-empty, contains no
#      NA and every element equals TRUE; anything else is reported as "Fail".
check <- assert_that <- function( e ) {
    # deparse() may split a long expression over several strings; join them so
    # that exactly one report line is printed per assertion.
    expression <- paste(deparse(substitute(e)), collapse = " ")
    passed <- length(e) > 0 && !anyNA(e) && all(e == TRUE)
    cat(paste("Assertion: ", expression, ": ", if (passed) "Pass" else "Fail", "\n"))
}

# To make sure all our data is autogenerated, this function stores a named
# variable and its value in both LaTeX and CSV form.
#
# Arguments:
#   var       Name of the variable; used as the LaTeX command name and CSV key.
#   val       Value to record.
#   suppress  When true, the value is only echoed to the console and nothing is
#             appended to the log files.
#
# The log files are taken from the globals LATEX_LOGFILE and CSV_LOGFILE. The
# LaTeX value is formatted with thousands separators; the CSV value is raw.
out <- function(var, val, suppress = F) {
    cat(paste0(var, ": ", val, "\n"))
    if (suppress) {
        return(invisible(NULL))
    }
    formatted <- prettyNum(val, big.mark = ",", scientific = FALSE)
    latexLine <- paste0("\u{005C}newcommand{\u{005C}", var, "}{", formatted, "\u{005C}xspace}\n")
    csvLine <- paste0(var, ",", val, "\n")
    cat(latexLine, file = LATEX_LOGFILE, append = TRUE)
    cat(csvLine, file = CSV_LOGFILE, append = TRUE)
}

# Data Acquisition

# Loads the initial input file containing a row for each unique commit & file
# name, and returns it without the CSS and shell rows.
# The row counts before and after filtering are checked against hard-coded
# expected sizes via assert_that. The local variable names are kept as they are
# because the assertion report prints the checked expression verbatim.
loadEverything <- function() {
    inputFile <- "./original/sqlDump/DATA/_everything.csv"
    everything_with_css_and_shell <- fread(file = inputFile, showProgress = FALSE)
    assert_that(nrow(everything_with_css_and_shell) == 5102488)
    # CSS and shell files are not used in the paper at all, so drop them here
    everything <- filter(everything_with_css_and_shell, tag != "css", tag != "shell")
    assert_that(nrow(everything) == 4989792)
    everything
}

# Loads the processed input file containing a row for each commit & language.
# The number of rows read is checked against a hard-coded expected size via
# assert_that; the table is returned unchanged.
loadNewSha <- function() {
    inputFile <- "./original/sqlDump/DATA/_newSha.csv"
    newSha <- fread(file = inputFile, showProgress = FALSE)
    assert_that(nrow(newSha) == 1578165)
    newSha
}

# Language Categories

# Maps each language to its class label as used in the original study.
# A label has four parts: paradigm (Fun/Pro/Scr), typing discipline (Sta/Dyn),
# type strength (Str/Wea) and memory management (Man/Unm). Anything that is not
# listed below is labelled "Other".
#
# Arguments:
#   languages  Vector of language names (converted with as.character).
# Returns a character vector of labels named by the input language names.
classifyLanguageAsOriginal <- function(languages) {
    classes <- list(
        "Fun Sta Str Man" = c("Haskell", "Scala"),
        "Fun Dyn Str Man" = c("Clojure", "Erlang"),
        "Pro Sta Str Man" = c("C#", "Java", "Go"),
        "Scr Dyn Str Man" = c("Python", "Ruby", "Typescript"),
        "Scr Dyn Wea Man" = c("Coffeescript", "Javascript", "Perl", "Php"),
        "Pro Sta Wea Unm" = c("C", "C++", "Objective-C")
    )
    # Flatten into a language -> label lookup vector
    labelOf <- setNames(
        rep(names(classes), lengths(classes)),
        unlist(classes, use.names = FALSE)
    )

    combine <- function(l) {
        if (l %in% names(labelOf)) labelOf[[l]] else "Other"
    }

    mapply(combine, as.character(languages))
}

# Maps each language to its class label under the corrected classification.
# A label has four parts: paradigm (Fun/Pro/Scr), typing discipline (Sta/Dyn),
# type strength (Str/Wea) and memory management (Man/Unm). Languages that are not
# listed below (this includes Scala and Objective-C) are labelled "Other".
#
# Arguments:
#   languages  Vector of language names (converted with as.character).
# Returns a character vector of labels named by the input language names.
classifyLanguageCorrected <- function(languages) {
    classes <- list(
        "Fun Sta Str Man" = c("Haskell"),
        "Fun Dyn Wea Man" = c("Clojure", "Erlang"),
        "Pro Sta Str Man" = c("C#", "Java", "Go"),
        "Scr Dyn Wea Man" = c("Python", "Perl", "Php", "Ruby", "Javascript", "Coffeescript", "Typescript"),
        "Pro Sta Wea Unm" = c("C", "C++")
    )
    # Flatten into a language -> label lookup vector
    labelOf <- setNames(
        rep(names(classes), lengths(classes)),
        unlist(classes, use.names = FALSE)
    )

    combine <- function(l) {
        if (l %in% names(labelOf)) labelOf[[l]] else "Other"
    }

    mapply(combine, as.character(languages))
}

# Summarizing data per language 

# Aggregates commit-level rows into one summary per project and language.
#
# Arguments:
#   what  Table with (at least) the columns project, language, sha, insertion,
#         commit_age, isbug, combined, domain and devs.
# Returns the dplyr summary with the columns commits (distinct shas), tins
# (total insertions), max_commit_age, bcommits (sum of isbug), combined, domain
# and devs (distinct developers).
summarizeByLanguage <- function(what) {
    perProjectLanguage <- group_by(what, project, language)
    dplyr::summarize(
        perProjectLanguage,
        commits = n_distinct(sha),
        tins = sum(insertion),
        max_commit_age = max(commit_age),
        bcommits = sum(isbug),
        combined = unique(combined),
        domain = unique(domain),
        devs = n_distinct(devs)
    )
}

# Builds the model input frame with log-transformed predictors.
#
# Arguments:
#   what  Per-project/language summary with the columns language, devs, commits,
#         tins, max_commit_age, bcommits, combined and domain (language, combined
#         and domain must be factors, since they are releveled).
#   log1  Transformation applied to devs, commits and max_commit_age.
#   log2  Transformation applied to tins and bcommits.
# Returns a data frame with the transformed columns plus `*_r` variants of the
# factors in which the last level has been moved to the reference position.
logTransform <- function(what, log1 = log, log2 = log) {
    # Moves the last level of a factor to the front (reference level)
    lastLevelAsReference <- function(f) relevel(f, rev(levels(f))[1])

    # Zero bug counts are replaced by 0.5 before the transformation is applied
    bugCommits <- what$bcommits
    adjustedBugCommits <- bugCommits + 0.5 * (bugCommits == 0)

    data.frame(
        language = what$language,
        ldevs = log1(what$devs),
        lcommits = log1(what$commits),
        ltins = log2(what$tins),
        lmax_commit_age = log1(what$max_commit_age),
        lbcommits = log2(adjustedBugCommits),
        bcommits = bugCommits,
        combined = factor(what$combined),
        domain = factor(what$domain),
        domain_r = lastLevelAsReference(what$domain),
        language_r = lastLevelAsReference(what$language),
        commits = what$commits,
        combined_r = lastLevelAsReference(what$combined)
    )
}

# Weighted contrasts as described and used by the authors of the original paper.
# Starts from sum-to-zero contrasts for the levels of `fac` and replaces the last
# row with the negated counts of the leading levels divided by the last entry of
# summary(fac).
#
# Arguments:
#   fac  A factor.
# Returns the contrast matrix (one row per level).
contr.Weights <- function(fac)
{
    counts <- summary(fac)
    weighted <- contr.sum(levels(fac))
    leadingCounts <- counts[seq_len(ncol(weighted))]
    weighted[nrow(weighted), ] <- -leadingCounts / counts[length(counts)]
    weighted
}

# Produces the display names for the rows of a combined model table.
#
# Arguments:
#   model  Fitted model whose summary has a `coefficients` matrix; its first
#          five row names (intercept plus four control variables) are used.
#   var    The factor of interest; the names of summary(var) are appended.
# Returns a character vector named by the raw row names, in which the intercept
# and the control variables have been replaced by readable labels.
getModelRowNames <- function(model, var) {
    controlVariables <- 4
    coefficientNames <- rownames(summary(model)$coefficients)
    labels <- c(coefficientNames[seq_len(1 + controlVariables)], names(summary(var)))
    names(labels) <- labels
    readable <- c(
        "(Intercept)" = "Intercept",
        "lmax_commit_age" = "log age",
        "ltins" = "log size",
        "ldevs" = "log devs",
        "lcommits" = "log commits"
    )
    labels[names(readable)] <- readable
    labels
}

# Takes the fitted model and the releveled second model and combines them into a
# single data frame. All coefficient rows come from `model`; one extra row is
# taken from `model_r`, namely the first coefficient after the intercept and the
# four control variables.
#
# Arguments:
#   model       Fitted model.
#   model_r     The same model fitted with the releveled factor.
#   var         The factor of interest (used for the row names).
#   pValAdjust  "bonferroni" or "fdr" to adjust the p-values of the rows after
#               the control variables; any other value leaves them unadjusted.
# Returns a data frame with the columns coef and se (rounded to two digits) and
# pVal, with row names from getModelRowNames.
combineModels <- function(model, model_r, var, pValAdjust = "none") {
    controlVariables <- 4
    firstLevelRow <- controlVariables + 2
    main <- summary(model)$coefficients
    releveled <- summary(model_r)$coefficients

    # One column of the main model followed by the extra row of the releveled one
    stacked <- function(column) c(main[, column], releveled[firstLevelRow, column])

    estimates <- round(stacked(1), 2)
    stdErrors <- round(stacked(2), 2)
    pValues <- stacked(4)
    if (pValAdjust == "bonferroni" || pValAdjust == "fdr") {
        adjusted <- firstLevelRow:length(pValues)
        pValues[adjusted] <- p.adjust(pValues[adjusted], pValAdjust)
    }
    names(estimates) <- getModelRowNames(model, var)
    data.frame(
        coef = estimates,
        se = stdErrors,
        pVal = pValues
    )
}


# Row-wise check whether a p-value is at most the baseline p-value.
#
# Arguments:
#   data      Data frame holding both columns.
#   baseline  Name of the baseline p-value column.
#   pVal      Name of the p-value column to check.
#   naVal     Result for rows where the comparison itself is NA.
# Returns a vector that is TRUE where pVal <= baseline, TRUE wherever the
# baseline is NA, and always TRUE for the first five rows (control variables).
checkPValues <- function(data, baseline, pVal, naVal = TRUE) {
    reference <- data[[baseline]]
    passes <- data[[pVal]] <= reference
    passes <- replace(passes, is.na(passes), naVal)
    passes <- replace(passes, is.na(reference), TRUE)
    # control variables are always true
    replace(passes, 1:5, TRUE)
}

# Row-wise check whether a p-value is at most a fixed significance level.
#
# Arguments:
#   data      Data frame holding both columns.
#   baseline  Name of the baseline p-value column.
#   pVal      Name of the p-value column to check.
#   level     Threshold the p-value is compared against.
#   naVal     Replacement for NA entries of the result. The result starts out
#             all FALSE and is only ever set to TRUE, so for data-frame input
#             it contains no NA and this argument has no effect.
# Returns a logical vector that is TRUE where pVal <= level, TRUE wherever the
# baseline is NA, and always TRUE for the first five rows (control variables).
# Rows whose p-value is NA stay FALSE unless one of the other rules applies.
checkPValuesLevel <- function(data, baseline, pVal, level, naVal = TRUE) {
    # start with every row failing
    passes <- logical(nrow(data))
    # rows whose p-value is at or below the threshold pass
    passes[data[[pVal]] <= level] <- TRUE
    # rows for which the baseline makes no prediction pass as well
    passes[is.na(data[[baseline]])] <- TRUE
    passes[is.na(passes)] <- naVal
    # control variables are always true
    passes[1:5] <- TRUE
    passes
}

# Row-wise validity based on a precomputed significance column.
#
# Arguments:
#   data      Data frame holding both columns.
#   baseline  Name of the baseline p-value column.
#   sig       Name of the significance column.
#   naVal     Result for rows whose significance value is NA.
# Returns the significance column with TRUE wherever the baseline is NA, naVal
# for remaining NA entries, and always TRUE for the first five rows (control
# variables).
checkSignificance <- function(data, baseline, sig, naVal = TRUE) {
    flags <- data[[sig]]
    flags <- replace(flags, is.na(data[[baseline]]), TRUE)
    flags <- replace(flags, is.na(flags), naVal)
    # control variables are always true
    replace(flags, 1:5, TRUE)
}

# Formats p-value thresholds for display: NA becomes "--", empty strings are left
# alone, and every other entry gets a "<" prefix.
# Returns a character vector.
lessThanPv <- function(x) {
    x[is.na(x)] <- "--"
    needsPrefix <- x != "--" & x != ""
    x[needsPrefix] <- paste0("<", x[needsPrefix])
    x
}

# Formats p-values relative to a per-row threshold.
#
# Arguments:
#   x        P-values (numeric, or character holding numbers, "NA" or "--").
#   against  Thresholds, one per element of x; NA means "no threshold".
# Returns a character vector in which
#   * NA, "NA" and "--" become "--",
#   * values strictly below their threshold become "<threshold",
#   * all other values are prefixed with "\hphantom{<}" so that the columns line
#     up in LaTeX. Non-numeric text is passed through with that prefix.
lessThanPvCheckNAFail = function(x, against) {
    # Take numeric views before x is turned into text. Comparing the text forms
    # instead would order e.g. "1e-04" after "0.001" and would make the result
    # depend on the collation rules of the current locale.
    value = suppressWarnings(as.numeric(as.character(x)))
    threshold = suppressWarnings(as.numeric(as.character(against)))
    x[is.na(x)] = "--"
    x[x == "NA"] = "--"
    present = x != "--"
    # Both masks are fixed before any cell is rewritten, so a cell can never be
    # formatted twice.
    below = present & !is.na(value) & !is.na(threshold) & value < threshold
    rest = present & !below
    x[below] = paste("<", against[below], sep = "")
    x[rest] = paste("\\hphantom{<}", x[rest], sep = "")
    x
}

# Formats p-values against the fixed threshold 0.01.
#
# Arguments:
#   x  P-values (numeric, or character holding numbers, "NA" or "--").
# Returns a character vector in which NA, "NA" and "--" become "--", values
# strictly below 0.01 become "<0.01", and all other values are prefixed with
# "\hphantom{<}" so that the columns line up in LaTeX. Non-numeric text is passed
# through with that prefix.
lessThanPvCheck01 = function(x) {
    # Compare numerically: once x holds text, "<" would compare strings, which
    # misorders values printed in scientific notation and depends on the locale.
    value = suppressWarnings(as.numeric(as.character(x)))
    x[is.na(x)] = "--"
    x[x == "NA"] = "--"
    present = x != "--"
    # Both masks are fixed before any cell is rewritten
    below = present & !is.na(value) & value < 0.01
    rest = present & !below
    x[below] = "<0.01"
    x[rest] = paste("\\hphantom{<}", x[rest], sep = "")
    x
}

# Formats p-values against the fixed threshold 0.001.
#
# Arguments:
#   x  P-values (numeric, or character holding numbers, "NA" or "--").
# Returns a character vector in which NA, "NA" and "--" become "--", values
# strictly below 0.001 become "<0.001", and all other values are prefixed with
# "\hphantom{<}" so that the columns line up in LaTeX. Non-numeric text is passed
# through with that prefix.
lessThanPvCheck001 = function(x) {
    # Compare numerically: once x holds text, "<" would compare strings, which
    # misorders values printed in scientific notation and depends on the locale.
    value = suppressWarnings(as.numeric(as.character(x)))
    x[is.na(x)] = "--"
    x[x == "NA"] = "--"
    present = x != "--"
    # Both masks are fixed before any cell is rewritten
    below = present & !is.na(value) & value < 0.001
    rest = present & !below
    x[below] = "<0.001"
    x[rest] = paste("\\hphantom{<}", x[rest], sep = "")
    x
}

# Prefixes non-negative values with "~" so that they line up with negative ones
# (which carry a minus sign) in the LaTeX tables.
#
# Arguments:
#   x  Coefficients (numeric, or character holding numbers).
# Returns a character vector; NA becomes the empty string.
prependTyldeToPositiveValues = function(x) {
    # Decide the sign on the numeric values. After the NA replacement below x is
    # text, and ordering strings such as "-0.08" against "0" depends on the
    # collation rules of the current locale.
    value = suppressWarnings(as.numeric(as.character(x)))
    nonNegative = !is.na(value) & value >= 0
    x[is.na(x)] = ""
    x[nonNegative] = paste("~", x[nonNegative], sep = "")
    x
}

# Replaces every kind of "missing" cell with "--": real NA values, the literal
# string "NA" and the empty string. Returns a character vector.
nasToDashes <- function(x) {
    x[is.na(x)] <- "--"
    for (placeholder in c("NA", "")) {
        x[x == placeholder] <- "--"
    }
    x
}


# Prefixes each cell with a LaTeX background colour command: white where `pass`
# is true, light gray where it is false.
#
# Arguments:
#   x     Cell contents.
#   pass  Logical vector, one entry per cell.
# Returns a character vector of "<colour command> <cell>".
latexPassFailCell <- function(x, pass) {
    # index 1 = failing cell, index 2 = passing cell
    cellColour <- c("\\cellcolor{gray!25}", "\\cellcolor{white}")
    paste(cellColour[as.numeric(pass) + 1], x)
}

# Sanitizer handed to print.xtable: escapes every "#" as "\#" and leaves all
# other characters untouched.
latexSanitizer <- function(x) {
    gsub("#", "\\#", x, fixed = TRUE)
}

# Full outer join of two data frames on their row names. The original row order
# is kept (no sorting) and the row names are restored on the result instead of
# being left behind in a `Row.names` column.
mergeDataFrames <- function(d1, d2) {
    merged <- merge(d1, d2, by = 0, all = TRUE, sort = FALSE)
    keys <- merged$Row.names
    merged <- dplyr::select(merged, -(Row.names))
    rownames(merged) <- keys
    merged
}

# Writes the RQ1 repetition table as LaTeX to
# ./artifact/permutations/languages_table_repetition.tex.
#
# Layout of the table:
# names || original authors || Repetition
#       || FSE   | CACM     || none
# Coef & pVal
#
# Arguments:
#   result  Data frame with one row per model term, holding the FSE_*, CACM_*
#           and repetition_* columns plus the reanalysis columns, which are
#           dropped here.
output_RQ1_table_repetition = function(result) {

    # Validity flags are computed first, before the columns they read are
    # overwritten with formatted strings. Only the flags for the columns shown
    # in this table are needed.
    validCACM = checkPValuesLevel(result, "FSE_pv", "CACM_pv", 0.05)
    validRepetition = checkPValues(result, "FSE_pv", "repetition_pv")

    result$FSE_coef = prependTyldeToPositiveValues(result$FSE_coef)
    result$CACM_coef = prependTyldeToPositiveValues(result$CACM_coef)
    result$CACM_pv = lessThanPv(result$CACM_pv)
    result$repetition_coef = prependTyldeToPositiveValues(result$repetition_coef)

    # Shade the cells that do not agree with the FSE baseline
    result$CACM_coef = latexPassFailCell(result$CACM_coef, validCACM)
    result$CACM_pv = latexPassFailCell(result$CACM_pv, validCACM)
    result$repetition_coef = latexPassFailCell(result$repetition_coef, validRepetition)
    result$repetition_pv = lessThanPvCheck(result$repetition_pv, result$FSE_pv)
    result$repetition_pv = latexPassFailCell(result$repetition_pv, validRepetition)

    # FSE_pv is formatted last because it is still read unformatted above
    result$FSE_pv = lessThanPv(result$FSE_pv)

    # The reanalysis columns are not part of this table
    result$clean_coef = NULL
    result$clean_pv = NULL
    result$adjusted_fdr = NULL
    result$adjusted_bonf = NULL
    result$zeroSum_coef = NULL
    result$zeroSum_pv = NULL
    result$bootstrap_coef = NULL
    result$bootstrap_sig = NULL

    # Three header lines before the first row, and a double rule after row 5
    # (the intercept and the four control variables)
    addtorow <- list()
    addtorow$pos <- list(0,0,0,5)
    addtorow$command <- c(
        "\\rule{0pt}{3ex} & \\multicolumn{4}{c||}{\\normalsize Original Authors}  & \\multicolumn{2}{c}{\\normalsize Repetition}\\\\[1mm]",
        "& \\multicolumn{2}{c|}{(a) FSE~\\cite{ray14}} & \\multicolumn{2}{c||}{(b) CACM~\\cite{ray17}} & \\multicolumn{2}{c}{(c)}\\\\",
        "& Coef & P-val & Coef & P-val & Coef & P-val\\\\",
        "\\hline\n\\hline\n")

    t = xtable(result)

    align(t) = c("@{}r","||l","l|","l","l","||l","l@{}")
    print(t, add.to.row = addtorow, include.colnames = F, size = "small", file = "./artifact/permutations/languages_table_repetition.tex", sanitize.text.function = latexSanitizer, scalebox = .9, floating = F)
}

# Writes the RQ1 reanalysis table as LaTeX to
# ./artifact/permutations/languages_table_reanalysis.tex.
#
# Arguments:
#   result  Data frame with one row per model term, holding the FSE_* baseline
#           columns, the reanalysis columns (clean_*, adjusted_fdr,
#           adjusted_bonf, zeroSum_*, bootstrap_*) and the CACM_* / repetition_*
#           columns, which are dropped here.
#
# NOTE(review): lessThanPvCheck is not defined in this part of the file;
# presumably it formats a p-value relative to the FSE threshold - confirm
# against its definition.
output_RQ1_table_reanalysis = function(result) {
    # Validity flags are computed first, before the columns they read are
    # overwritten with formatted strings below.
    validClean = checkPValues(result, "FSE_pv", "clean_pv", naVal = F)
    #validFdr = checkPValuesLevel(result, "FSE_pv", "adjusted_fdr", 0.01)
    #validBonf = checkPValuesLevel(result, "FSE_pv","adjusted_bonf", 0.01)
    #validZeroSum = checkPValuesLevel(result, "FSE_pv", "zeroSum_pv", 0.01)
    validFdr = checkPValues(result, "FSE_pv", "adjusted_fdr", naVal = F)
    validBonf = checkPValues(result, "FSE_pv","adjusted_bonf", naVal = F)
    validZeroSum = checkPValues(result, "FSE_pv", "zeroSum_pv", naVal = F)
    validBootstrap = checkSignificance(result, "FSE_pv", "bootstrap_sig", naVal = F)
    
    
    # Turn the numeric columns into display strings: "~" prefix for non-negative
    # coefficients, "--" for missing cells.
    result$FSE_coef = prependTyldeToPositiveValues(result$FSE_coef)
    result$clean_coef = nasToDashes(prependTyldeToPositiveValues(result$clean_coef))
    result$clean_pv = nasToDashes(result$clean_pv)
    result$adjusted_fdr = nasToDashes(result$adjusted_fdr)
    result$adjusted_bonf = nasToDashes(result$adjusted_bonf)
    result$zeroSum_coef = nasToDashes(prependTyldeToPositiveValues(result$zeroSum_coef))
    result$zeroSum_pv = nasToDashes(result$zeroSum_pv)
    result$bootstrap_coef = nasToDashes(prependTyldeToPositiveValues(result$bootstrap_coef))
    # bootstrap_sig is indexed as 0/1: " " when false, "*" when true
    result$bootstrap_sig = nasToDashes(c(" ", "*")[result$bootstrap_sig + 1])
    
    
    # Format the p-values against the FSE baseline (FSE_pv is still unformatted
    # here) and shade every cell according to its validity flag.
    result$clean_coef = latexPassFailCell(result$clean_coef, validClean)
    result$clean_pv = lessThanPvCheck(result$clean_pv, result$FSE_pv)
    result$clean_pv = latexPassFailCell(result$clean_pv, validClean)
    #result$adjusted_fdr = lessThanPvCheck01(result$adjusted_fdr)
    result$adjusted_fdr = lessThanPvCheck(result$adjusted_fdr, result$FSE_pv)
    result$adjusted_fdr = latexPassFailCell(result$adjusted_fdr, validFdr)
    #result$adjusted_bonf = lessThanPvCheck01(result$adjusted_bonf)
    result$adjusted_bonf = lessThanPvCheck(result$adjusted_bonf, result$FSE_pv)
    result$adjusted_bonf = latexPassFailCell(result$adjusted_bonf, validBonf)
    result$zeroSum_coef = latexPassFailCell(result$zeroSum_coef, validZeroSum)
    #result$zeroSum_pv = lessThanPvCheck01(result$zeroSum_pv)
    result$zeroSum_pv = lessThanPvCheck(result$zeroSum_pv, result$FSE_pv)
    result$zeroSum_pv = latexPassFailCell(result$zeroSum_pv, validZeroSum)
    result$bootstrap_coef = latexPassFailCell(result$bootstrap_coef, validBootstrap)
    result$bootstrap_sig = latexPassFailCell(result$bootstrap_sig, validBootstrap)
    
    # FSE_pv is formatted last because it is read unformatted above
    result$FSE_pv = lessThanPv(result$FSE_pv)
    
    # The CACM and repetition columns are not part of this table
    result$CACM_coef = NULL
    result$CACM_pv = NULL
    result$repetition_coef = NULL
    result$repetition_pv = NULL

    # Three header lines before the first row, and a double rule after row 5
    # (the intercept and the four control variables)
    addtorow <- list()
    addtorow$pos <- list(0, 0, 0, 5)
    addtorow$command <- c(
        "\\rule{0pt}{3ex} & \\multicolumn{2}{c||}{\\normalsize Original Authors}  & \\multicolumn{8}{c}{\\normalsize Reanalysis}\\\\[1mm]",
        #"& \\multicolumn{6}{|c||}{\\normalsize Original} & \\multicolumn{8}{|c}{\\normalsize Reproduction}\\\\[1mm]",
        "& \\multicolumn{2}{c||}{(a) FSE~\\cite{ray14}} & \\multicolumn{2}{c|}{(b) cleaned data} & \\multicolumn{2}{c||}{(c) pV adjusted} & \\multicolumn{2}{c||}{(d) zero-sum} & \\multicolumn{2}{c}{(e) bootstrap}\\\\",
        # FSE                    | CACM                  | repro                  | cleaned                | pvals      | zero Sum              | Bootstrap
        "& Coef & P-val & Coef & P-val & FDR & Bonf & Coef & Bonf & Coef & sig.\\\\",
        "\\hline\n\\hline\n")
    
    t = xtable(result)
    
    align(t) = c("@{}r","||l","l","||l","l","|l","l","||l","l","||l","l@{}")
    print(t, add.to.row = addtorow, include.colnames = F, size = "small", file = "./artifact/permutations/languages_table_reanalysis.tex", sanitize.text.function = latexSanitizer, scalebox = .9, floating = F)
    
}


# Creates the table we use for RQ2, which compares the language classes as
# originally published, as repeated and as reclassified. The LaTeX output goes to
# <WORKING_DIR>/Data/languages_classes_table.tex.
#
# Arguments:
#   data              Model table of the repetition (columns coef and pVal,
#                     row names are the model terms).
#   dataReclassified  Model table of the reclassification, same shape.
output_RQ2_table <- function(data, dataReclassified) {
    baseline <- baselineFSE_RQ2()

    # Rename the reclassified columns so that they do not clash with those of the
    # repetition in the merge; the row names are saved and restored explicitly
    # around the dplyr calls.
    reclassifiedNames <- rownames(dataReclassified)
    dataReclassified <- dplyr::select(
        mutate(dataReclassified, xCoef = coef, xpVal = pVal),
        xCoef, xpVal
    )
    rownames(dataReclassified) <- reclassifiedNames
    data <- dplyr::select(data, coef, pVal)

    # Outer-join repetition, reclassification and baseline on the row names
    merged <- merge(data, dataReclassified, by = 0, sort = FALSE, all = TRUE)
    rownames(merged) <- merged$Row.names
    merged <- dplyr::select(merged, -(Row.names))
    merged <- merge(merged, baseline, by = 0, sort = FALSE, all = TRUE)

    cells <- data.frame(
        fseCoef = merged$FSE_coef,
        fsePv = merged$FSE_pv,
        usCoef = merged$coef,
        usPv = round(merged$pVal, 3),
        reclassifiedCoef = merged$xCoef,
        reclassifiedPv = round(merged$xpVal, 3)
    )
    # Rows from the sixth one on (after the intercept and control variables) are
    # typeset in typewriter font
    rowLabels <- merged$Row.names
    classRows <- 6:length(rowLabels)
    rowLabels[classRows] <- paste("\\tt", rowLabels[classRows])
    rownames(cells) <- rowLabels

    # Validity is decided on the numeric values, before they are turned into
    # display strings
    repetitionPass <- checkPValues(cells, "fsePv", "usPv")
    reclassificationPass <- checkPValues(cells, "fsePv", "reclassifiedPv")

    formatCoef <- function(v) nasToDashes(prependTyldeToPositiveValues(v))
    formatPv <- function(v) lessThanPvCheck001(nasToDashes(v))

    cells$fseCoef <- formatCoef(cells$fseCoef)
    cells$fsePv <- lessThanPv(nasToDashes(cells$fsePv))
    cells$usCoef <- latexPassFailCell(formatCoef(cells$usCoef), repetitionPass)
    cells$usPv <- latexPassFailCell(formatPv(cells$usPv), repetitionPass)
    cells$reclassifiedCoef <- latexPassFailCell(formatCoef(cells$reclassifiedCoef), reclassificationPass)
    cells$reclassifiedPv <- latexPassFailCell(formatPv(cells$reclassifiedPv), reclassificationPass)

    # Two header lines before the first row, and a double rule after row 5
    addtorow <- list(
        pos = list(0, 0, 5),
        command = c(
            "\\rule{0pt}{3ex} & \\multicolumn{2}{c||}{\\normalsize (a) Original} & \\multicolumn{2}{c}{\\normalsize (b) Repetition} & \\multicolumn{2}{c}{\\normalsize (c) Reclassification}\\\\[1mm]",
            "& Coef & P-val & Coef & P-val & Coef & P-val \\\\",
            "\\hline\n\\hline\n"
        )
    )
    latexTable <- xtable(cells)
    align(latexTable) <- c("@{}r", "||l", "l||", "l","l", "|l", "l@{}")
    print(latexTable, add.to.row = addtorow, include.colnames = FALSE, size = "", file = paste0(WORKING_DIR, "/Data/languages_classes_table.tex"), sanitize.text.function = latexSanitizer, floating = FALSE)
}

# Creates the table we use for RQ3, which shows the significance and bug affinity
# for language domains. The LaTeX output goes to
# <WORKING_DIR>/Data/languages_domains_table.tex.
#
# Arguments:
#   data  Model table with the columns coef, se and pVal; the se column is
#         dropped before printing.
#
# The rows are split over two side-by-side minipages: the injected commands open
# the first inner tabular before row 1, close it and open the second one after
# row 5, and close the second one after row 11.
output_RQ3_table = function(data) {
    # remove the SE column
    data = data %>% dplyr::select(-(se))
    data$coef = prependTyldeToPositiveValues(data$coef)
    
    tD = xtable(data) 
    addToRow = list()
    addToRow$pos = list(0, 5,11)
    addToRow$command = c(
        "\\begin{minipage}{4.5cm}\\begin{tabular}{@{}r|rl@{}}  \\hline \n\\rule{0pt}{3ex} & Coef & p-Val \\\\ \\hline",
        # The header row of the second tabular must end with a LaTeX row
        # terminator (\\), exactly like the header of the first one; without it
        # the following \hline is not at the start of a row.
        "\\hline\n \\hline\n \\end{tabular}\\end{minipage}&\\begin{minipage}{4.5cm}\\begin{tabular}{@{}r|rl@{}}  \\hline\n \\rule{0pt}{3ex} & Coef & p-Val \\\\ \\hline \\hline",
        "\\end{tabular}\\end{minipage}"
    )
    
    align(tD) = c("@{}r","|r","l@{}")
    caption(tD) = "Parameter estimates, standard errors, p values, and significance with Bonferroni adjustment."
    label(tD) = "t6"
    print(tD, add.to.row = addToRow, include.colnames = F, size = "scriptsize", sanitize.text.function = latexSanitizer, scalebox = 1, floating = F, include.rownames = T, file = paste0(WORKING_DIR, "/Data/languages_domains_table.tex"))
}


# Hard-coded baseline values for RQ1 attributed to the FSE paper: one row per
# model term with the coefficient (FSE_coef), standard error (FSE_se) and
# significance threshold (FSE_pv; NA where no significance is recorded).
baselineFSE_RQ1 <- function() {
    terms <- c("Intercept", "log commits", "log age", "log size", "log devs", "C", "C++", "C#", "Objective-C", "Go", "Java", "Coffeescript", "Javascript", "Typescript",   "Ruby", "Php", "Python", "Perl", "Clojure", "Erlang", "Haskell", "Scala")
    estimates <- c(-1.93, 2.26, 0.11, 0.05, 0.16, 0.15, 0.23, 0.03, 0.18, -0.08, -0.01, -0.07, 0.06, -0.43, -0.15, 0.15, 0.10, -0.15, -0.29, -0.0, -0.23, -0.28)
    stdErrors <- c(0.10, 0.03, 0.03, 0.02, 0.03, 0.04, .04, .05, .05, .06, .04, .05, .02, .06, .04, .05, .03, .08, .05, .05, .06, .05)
    # Significance is stored as a code per term: 0 = none, 1 = 0.05, 2 = 0.01,
    # 3 = 0.001
    thresholds <- c(NA, 0.05, 0.01, 0.001)
    codes <- c(3,3,2,1,3,3,3,0,3,0,0,0,2,3,1,3,2,0,3,0,3,3)
    data.frame(
        FSE_coef = estimates,
        FSE_se = stdErrors,
        FSE_pv = thresholds[codes + 1],
        row.names = terms
    )
}

# Variant of the hard-coded FSE baseline for RQ1: one row per model term with
# the coefficient (FSE_coef), standard error (FSE_se) and significance threshold
# (FSE_pv; NA where no significance is recorded). It differs from
# baselineFSE_RQ1 only in the significance code of Php (2, i.e. 0.01).
baselineFSE_RQ1_fixed <- function() {
    terms <- c("Intercept", "log commits", "log age", "log size", "log devs", "C", "C++", "C#", "Objective-C", "Go", "Java", "Coffeescript", "Javascript", "Typescript",   "Ruby", "Php", "Python", "Perl", "Clojure", "Erlang", "Haskell", "Scala")
    estimates <- c(-1.93, 2.26, 0.11, 0.05, 0.16, 0.15, 0.23, 0.03, 0.18, -0.08, -0.01, -0.07, 0.06, -0.43, -0.15, 0.15, 0.10, -0.15, -0.29, -0.0, -0.23, -0.28)
    stdErrors <- c(0.10, 0.03, 0.03, 0.02, 0.03, 0.04, .04, .05, .05, .06, .04, .05, .02, .06, .04, .05, .03, .08, .05, .05, .06, .05)
    # Significance is stored as a code per term: 0 = none, 1 = 0.05, 2 = 0.01,
    # 3 = 0.001
    thresholds <- c(NA, 0.05, 0.01, 0.001)
    codes <- c(3,3,2,1,3,3,3,0,3,0,0,0,2,3,1,2,2,0,3,0,3,3)
    data.frame(
        FSE_coef = estimates,
        FSE_se = stdErrors,
        FSE_pv = thresholds[codes + 1],
        row.names = terms
    )
}

# Hard-coded baseline values for RQ2 attributed to the FSE paper: one row per
# model term (controls and language classes) with the coefficient (FSE_coef) and
# significance threshold (FSE_pv; NA where no significance is recorded).
baselineFSE_RQ2 <- function() {
    terms <- c("Intercept", "log commits", "log age", "log size", "log devs", "Fun Sta Str Man", "Fun Dyn Str Man", "Pro Sta Str Man", "Pro Sta Wea Unm", "Scr Dyn Str Man", "Scr Dyn Wea Man")
    estimates <- c(-2.13, 0.96, 0.07, 0.05, 0.07, -0.25, -0.17, -0.06, 0.14, 0.001, 0.04)
    # TODO: the standard errors are still missing from this baseline
    thresholds <- c(0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.05, 0.001, NA, 0.05)
    data.frame(
        FSE_coef = estimates,
        FSE_pv = thresholds,
        row.names = terms
    )
}


# Hard-coded baseline values for RQ1 attributed to the CACM paper: one row per
# model term with the coefficient (CACM_coef), standard error (CACM_se) and
# significance threshold (CACM_pv; NA where no significance is recorded).
baselineCACM_RQ1 <- function() {
    terms <- c("Intercept", "log age", "log size", "log devs", "log commits", "C", "C++", "C#", "Objective-C", "Go", "Java", "Coffeescript", "Javascript", "Typescript", "Ruby", "Php", "Python", "Perl", "Clojure", "Erlang", "Haskell", "Scala")
    estimates <- c(-2.04, .06, .04, .06, .96, .11, .18, -.02, .15, -.11, -.06, .06, .03, .15, -.13, .10, .08, -.12, -.30, -.03, -.26, -.24)
    stdErrors <- c(.11, .02, .01, .01, .01, .04, .04, .05, .05, .06, .04, .05, .03, .10, .05, .05, .04, .08, .05, .05, .06, .05)
    # Significance is stored as a code per term: 0 = none, 1 = 0.05, 2 = 0.01,
    # 3 = 0.001
    thresholds <- c(NA, 0.05, 0.01, 0.001)
    codes <- c(3,3,3,3,3,2,3,0,2,0,0,0,0,0,2,1,1,0,3,0,3,3)
    baseline <- data.frame(
        CACM_coef = estimates,
        CACM_se = stdErrors,
        CACM_pv = thresholds[codes + 1]
    )
    rownames(baseline) <- terms
    baseline
}