Merge pull request #43 from jroussea/jroussea-2

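Bug fix: the homogeneity score is now computed from the number of labels per cluster, clusters without a score are marked "unannotated" instead of 0 (and are filtered out before plotting), the hard-coded "peptides" column is replaced by the column_peptides argument, the zoomed histogram is removed, and the README badge is bumped to v1.0.3.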
jroussea · May 30, 2024 · 1c62739 · 1c62739
2 parents cc9ec93 + 915e183
commit 1c62739
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # LAGOON-MCL
 
-[![LAGOON-MCL](https://img.shields.io/badge/LAGOON--MCL-v1.0.2-red?labelColor=000000)](https://jroussea.github.io/LAGOON-MCL/)
+[![LAGOON-MCL](https://img.shields.io/badge/LAGOON--MCL-v1.0.3-red?labelColor=000000)](https://jroussea.github.io/LAGOON-MCL/)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-green?labelColor=000000)](https://lagoon-mcl-docs.readthedocs.io/en/latest/)
 [![Nextflow](https://img.shields.io/badge/nextflow_DSL2-%E2%89%A5_2.10.0-23aa62?labelColor=000000)](https://www.nextflow.io/)
 [![Singularity](https://img.shields.io/badge/run_with-singularity-1d355c?labelColor=000000)](https://sylabs.io/singularity/)

diff --git a/bin/distribution_homogeneity_score.R b/bin/distribution_homogeneity_score.R
@@ -30,17 +30,17 @@ LoadDataframe <- function(path_dataframe) {
 }
 
 
-CreationPlots <- function(dataframe, inflation, label) {
+CreationPlots <- function(dataframe_input, inflation, label) {
 
   x_labs <- gsub("_", " ", label)
 
   title_labs <- paste("Distribution", x_labs, "- Inflation: ", inflation)
 
-  cc_sup_0 <- dataframe %>% 
-    filter(homogeneity_score > 0)
+  dataframe <- dataframe_input %>% 
+    filter(homogeneity_score != "unannotated")
 
   graph <- dataframe %>%
-    ggplot(aes(x = homogeneity_score)) +
+    ggplot(aes(x = as.numeric(homogeneity_score))) +
     geom_histogram(bins = 100, color = "darkblue", fill = "lightblue") +
     theme_light() +
     labs(title = title_labs,
@@ -49,23 +49,8 @@ CreationPlots <- function(dataframe, inflation, label) {
     theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
           axis.title = element_text(size = 14),
           axis.text = element_text(size = 12))
-
-  title_labs <- paste("Distribution", x_labs, "- Inflation: ", inflation)
-
-  zoom <- cc_sup_0 %>% 
-    ggplot(aes(x = homogeneity_score)) +
-    geom_histogram(bins = 100, color = "darkblue", fill = "lightblue") +
-    theme_light() +
-    labs(title = title_labs,
-         subtitle = "Homogeneity score strictly greater than 0",
-         x = x_labs,
-         y = "Number of clusters")+
-    theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
-          plot.subtitle = element_text(size = 14, face = "italic"),
-          axis.title = element_text(size = 14),
-          axis.text = element_text(size = 12))
 
-  plot_list = list(graph = graph, zoom = zoom)
+  plot_list = list(graph = graph)
   return(plot_list)
 }
 
@@ -81,8 +66,7 @@ MainFunction <- function() {
 
   plot_list <- CreationPlots(df_homogeneity_score, args$inflation, args$label)
   print(plot_list$graph)
-  print(plot_list$zoom)
-
+
   dev.off()
 }
 

diff --git a/bin/network_homogeneity_score.py b/bin/network_homogeneity_score.py
@@ -8,8 +8,10 @@
 
 import numpy as np
 import pandas as pd
+#import decimal
 import sys
 
+#decimal.getcontext().prec = 6
 
 def load_dataframe(path_network, path_label):
 
@@ -49,7 +51,7 @@ def negative_homogeneity_score(cluster, columns_name, column_peptides):
 
     # Étape 3 dataframes du nombre de Noeuds par label
     # à mettre à jour à chaque itération
-    df_labels = cluster.drop(["CC", "peptides"], axis = 1) \
+    df_labels = cluster.drop(["CC", column_peptides], axis = 1) \
         .assign(size = cluster.groupby(columns_name).transform('size')) \
             .drop_duplicates(keep = 'first')
 
@@ -78,11 +80,14 @@ def negative_homogeneity_score(cluster, columns_name, column_peptides):
         df_labels = df_labels.assign(size = df_labels.groupby("index") \
                                      .transform("size")) \
             .drop_duplicates(keep = 'first')
-
+
+        df_labels.rename(columns={'index': columns_name}, inplace = True)
+
     return(list_labels)
 
 
-def homogeneity_score(cluster, columns_name, cluster_size, column_peptides):
+def homogeneity_score(cluster, columns_name, cluster_size, column_peptides, 
+                      basename, selection):
 
     cluster.dropna(inplace = True)
 
@@ -95,19 +100,30 @@ def homogeneity_score(cluster, columns_name, cluster_size, column_peptides):
 
     elif len(list_label) > 1:
 
-        sequence_label = len(cluster[column_peptides].unique().tolist())
+        # partie de code à vérifier
+        #sequence_label = len(cluster[column_peptides].unique().tolist())
 
         cc_size = list(cluster_size.loc[cluster_size["CC"] == cc]["CC_size"])[0]
 
-        hom_score = 1-(sequence_label/cc_size)
+        #hom_score = 1-(decimal.Decimal(len(list_label))/decimal.Decimal(cc_size))
 
-    #return(hom_score)
+        hom_score = 1-(len(list_label)/cc_size)
+
         if hom_score < 0:
 
-            list_labels = negative_homogeneity_score(cluster, columns_name,
+            list_label = negative_homogeneity_score(cluster, columns_name,
                                                      column_peptides)
-
-            hom_score = 1-(len(list_labels)/cc_size)
+
+            #hom_score = 1-(decimal.Decimal(len(list_label))/decimal.Decimal(cc_size))
+
+            hom_score = 1-(len(list_label)/cc_size)
+
+
+    for label in list_label:
+
+        with open(f"{basename}_{selection}.txt", 'a', encoding = "utf8") as f:
+
+            f.write(f"{cc}\t{label}\n")
 
     return(hom_score)
 
@@ -156,24 +172,27 @@ def main(path_network, path_label, column_peptides, inflation, basename):
                                homogeneity_score(cluster, 
                                                  columns_name,
                                                  cluster_size,
-                                                 column_peptides))
+                                                 column_peptides, basename,
+                                                 "all"))
 
     df_homogeneity_score_annotated = network_label.groupby("CC", 
                                                            as_index = False) \
                         .apply(lambda cluster: \
                                homogeneity_score(cluster, 
                                                  columns_name, 
                                                  cluster_size_annotated,
-                                                 column_peptides))
+                                                 column_peptides,
+                                                 basename,
+                                                 "annotated"))
 
 
     df_homogeneity_score = df_homogeneity_score \
         .merge(cluster_size, on = "CC", how = "right") \
-            .replace(np.nan, 0)
+            .replace(np.nan, "unannotated")
 
     df_homogeneity_score_annotated = df_homogeneity_score_annotated \
         .merge(cluster_size_annotated, on = "CC", how = "right") \
-            .replace(np.nan, 0)
+            .replace(np.nan, "unannotated")
 
     save_dataframe(df_homogeneity_score, "all", inflation, basename)
     save_dataframe(df_homogeneity_score_annotated, "annotated", inflation, 
@@ -195,3 +214,12 @@ def main(path_network, path_label, column_peptides, inflation, basename):
     #basename = "label_interproscan"
 
     main(path_network, path_label, column_peptides, inflation, basename)
+
+
+#path_network = "network_I2.tsv"
+#path_label = "label_SMART.tsv"
+#column_peptides = "peptides"
+#inflation = 2
+#basename = "label_SMART"
+
+#main(path_network, path_label, column_peptides, inflation, basename)