Skip to content

Commit

Permalink
Merge pull request #43 from jroussea/jroussea-2
Browse files Browse the repository at this point in the history
bug fix
  • Loading branch information
jroussea authored May 30, 2024
2 parents cc9ec93 + 915e183 commit 1c62739
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 36 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LAGOON-MCL

[![LAGOON-MCL](https://img.shields.io/badge/LAGOON--MCL-v1.0.2-red?labelColor=000000)](https://jroussea.github.io/LAGOON-MCL/)
[![LAGOON-MCL](https://img.shields.io/badge/LAGOON--MCL-v1.0.3-red?labelColor=000000)](https://jroussea.github.io/LAGOON-MCL/)
[![Documentation Status](https://img.shields.io/badge/docs-latest-green?labelColor=000000)](https://lagoon-mcl-docs.readthedocs.io/en/latest/)
[![Nextflow](https://img.shields.io/badge/nextflow_DSL2-%E2%89%A5_2.10.0-23aa62?labelColor=000000)](https://www.nextflow.io/)
[![Singularity](https://img.shields.io/badge/run_with-singularity-1d355c?labelColor=000000)](https://sylabs.io/singularity/)
Expand Down
28 changes: 6 additions & 22 deletions bin/distribution_homogeneity_score.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,17 @@ LoadDataframe <- function(path_dataframe) {
}


CreationPlots <- function(dataframe, inflation, label) {
CreationPlots <- function(dataframe_input, inflation, label) {

x_labs <- gsub("_", " ", label)

title_labs <- paste("Distribution", x_labs, "- Inflation: ", inflation)

cc_sup_0 <- dataframe %>%
filter(homogeneity_score > 0)
dataframe <- dataframe_input %>%
filter(homogeneity_score != "unannotated")

graph <- dataframe %>%
ggplot(aes(x = homogeneity_score)) +
ggplot(aes(x = as.numeric(homogeneity_score))) +
geom_histogram(bins = 100, color = "darkblue", fill = "lightblue") +
theme_light() +
labs(title = title_labs,
Expand All @@ -49,23 +49,8 @@ CreationPlots <- function(dataframe, inflation, label) {
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12))

title_labs <- paste("Distribution", x_labs, "- Inflation: ", inflation)

zoom <- cc_sup_0 %>%
ggplot(aes(x = homogeneity_score)) +
geom_histogram(bins = 100, color = "darkblue", fill = "lightblue") +
theme_light() +
labs(title = title_labs,
subtitle = "Homogeneity score strictly greater than 0",
x = x_labs,
y = "Number of clusters")+
theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
plot.subtitle = element_text(size = 14, face = "italic"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12))

plot_list = list(graph = graph, zoom = zoom)
plot_list = list(graph = graph)
return(plot_list)
}

Expand All @@ -81,8 +66,7 @@ MainFunction <- function() {

plot_list <- CreationPlots(df_homogeneity_score, args$inflation, args$label)
print(plot_list$graph)
print(plot_list$zoom)


dev.off()
}

Expand Down
54 changes: 41 additions & 13 deletions bin/network_homogeneity_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@

import numpy as np
import pandas as pd
#import decimal
import sys

#decimal.getcontext().prec = 6

def load_dataframe(path_network, path_label):

Expand Down Expand Up @@ -49,7 +51,7 @@ def negative_homogeneity_score(cluster, columns_name, column_peptides):

# Step 3: dataframe holding the number of nodes per label;
# it must be updated at every iteration
df_labels = cluster.drop(["CC", "peptides"], axis = 1) \
df_labels = cluster.drop(["CC", column_peptides], axis = 1) \
.assign(size = cluster.groupby(columns_name).transform('size')) \
.drop_duplicates(keep = 'first')

Expand Down Expand Up @@ -78,11 +80,14 @@ def negative_homogeneity_score(cluster, columns_name, column_peptides):
df_labels = df_labels.assign(size = df_labels.groupby("index") \
.transform("size")) \
.drop_duplicates(keep = 'first')


df_labels.rename(columns={'index': columns_name}, inplace = True)

return(list_labels)


def homogeneity_score(cluster, columns_name, cluster_size, column_peptides):
def homogeneity_score(cluster, columns_name, cluster_size, column_peptides,
basename, selection):

cluster.dropna(inplace = True)

Expand All @@ -95,19 +100,30 @@ def homogeneity_score(cluster, columns_name, cluster_size, column_peptides):

elif len(list_label) > 1:

sequence_label = len(cluster[column_peptides].unique().tolist())
# NOTE(review): this section of the code still needs to be verified
#sequence_label = len(cluster[column_peptides].unique().tolist())

cc_size = list(cluster_size.loc[cluster_size["CC"] == cc]["CC_size"])[0]

hom_score = 1-(sequence_label/cc_size)
#hom_score = 1-(decimal.Decimal(len(list_label))/decimal.Decimal(cc_size))

#return(hom_score)
hom_score = 1-(len(list_label)/cc_size)

if hom_score < 0:

list_labels = negative_homogeneity_score(cluster, columns_name,
list_label = negative_homogeneity_score(cluster, columns_name,
column_peptides)

hom_score = 1-(len(list_labels)/cc_size)

#hom_score = 1-(decimal.Decimal(len(list_label))/decimal.Decimal(cc_size))

hom_score = 1-(len(list_label)/cc_size)


for label in list_label:

with open(f"{basename}_{selection}.txt", 'a', encoding = "utf8") as f:

f.write(f"{cc}\t{label}\n")

return(hom_score)

Expand Down Expand Up @@ -156,24 +172,27 @@ def main(path_network, path_label, column_peptides, inflation, basename):
homogeneity_score(cluster,
columns_name,
cluster_size,
column_peptides))
column_peptides, basename,
"all"))

df_homogeneity_score_annotated = network_label.groupby("CC",
as_index = False) \
.apply(lambda cluster: \
homogeneity_score(cluster,
columns_name,
cluster_size_annotated,
column_peptides))
column_peptides,
basename,
"annotated"))


df_homogeneity_score = df_homogeneity_score \
.merge(cluster_size, on = "CC", how = "right") \
.replace(np.nan, 0)
.replace(np.nan, "unannotated")

df_homogeneity_score_annotated = df_homogeneity_score_annotated \
.merge(cluster_size_annotated, on = "CC", how = "right") \
.replace(np.nan, 0)
.replace(np.nan, "unannotated")

save_dataframe(df_homogeneity_score, "all", inflation, basename)
save_dataframe(df_homogeneity_score_annotated, "annotated", inflation,
Expand All @@ -195,3 +214,12 @@ def main(path_network, path_label, column_peptides, inflation, basename):
#basename = "label_interproscan"

main(path_network, path_label, column_peptides, inflation, basename)


#path_network = "network_I2.tsv"
#path_label = "label_SMART.tsv"
#column_peptides = "peptides"
#inflation = 2
#basename = "label_SMART"

#main(path_network, path_label, column_peptides, inflation, basename)

0 comments on commit 1c62739

Please sign in to comment.