diff --git a/README.md b/README.md
index 076f460..b8ea763 100644
--- a/README.md
+++ b/README.md
@@ -45,10 +45,6 @@ Installation only requires a few minutes.
 #### Run 
 
     SCALE.py -d [input]
-
-if cluster number k is known: 
-
-    SCALE.py -d [input] -k [k]
 
 #### Output
 Output will be saved in the output folder including:
@@ -69,7 +65,7 @@ or get numerical imputed data in adata.h5ad file using scanpy **adata.obsm['impu
 * save results in a specific folder: [-o] or [--outdir]
 * embed feature by tSNE or UMAP: [--embed] tSNE/UMAP
 * filter low quality cells by valid peaks number, default 100: [--min_peaks]
-* filter low quality peaks by valid cells number, default 0.01: [--min_cells]
+* filter low quality peaks by valid cells number, default 3: [--min_cells]
 * filter peaks by selecting highly variable features, default 100,000: [--n_feature], disable by [--n_feature] -1.
 * modify the initial learning rate, default is 0.002: [--lr]
 * change iterations by watching the convergence of loss, default is 30000: [-i] or [--max_iter]
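The README hunk above changes the `--min_cells` default from a fraction (0.01) to an absolute count (3). As the `SCALE_function` docstring further down notes, values smaller than 1 are still treated as a fraction of cells; a minimal sketch of that rule (the helper name is hypothetical, not part of the library):

```python
def resolve_min_cells(min_cells: float, n_cells: int) -> int:
    """Interpret min_cells as a fraction of cells if < 1, else as an absolute count."""
    return int(min_cells * n_cells) if min_cells < 1 else int(min_cells)

# Old default on a 5,000-cell dataset: a peak must be open in >= 50 cells.
assert resolve_min_cells(0.01, 5000) == 50
# New default: a peak must be open in >= 3 cells, regardless of dataset size.
assert resolve_min_cells(3, 5000) == 3
```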
diff --git a/SCALE.py b/SCALE.py
index 02861e3..7a723de 100755
--- a/SCALE.py
+++ b/SCALE.py
@@ -14,23 +14,9 @@
 """
 
-import time
-import torch
-
-import numpy as np
-import pandas as pd
-import os
-import scanpy as sc
 import argparse
 
-from scale import SCALE
-from scale.dataset import load_dataset
-from scale.utils import read_labels, cluster_report, estimate_k, binarization
-from scale.plot import plot_embedding
-
-from sklearn.preprocessing import MaxAbsScaler
-from sklearn.cluster import KMeans
-from torch.utils.data import DataLoader
+from scale import SCALE_function
 
 
 if __name__ == '__main__':
@@ -49,7 +35,7 @@
     parser.add_argument('--decode_dim', type=int, nargs='*', default=[], help='encoder structure')
     parser.add_argument('--latent', '-l',type=int, default=10, help='latent layer dim')
     parser.add_argument('--min_peaks', type=float, default=100, help='Remove low quality cells with few peaks')
-    parser.add_argument('--min_cells', type=float, default=0.01, help='Remove low quality peaks')
+    parser.add_argument('--min_cells', type=float, default=3, help='Remove low quality peaks')
     parser.add_argument('--n_feature', type=int, default=100000, help='Keep the number of highly variable peaks')
     parser.add_argument('--log_transform', action='store_true', help='Perform log2(x+1) transform')
     parser.add_argument('--max_iter', '-i', type=int, default=30000, help='Max iteration')
@@ -62,109 +48,30 @@
 
     args = parser.parse_args()
 
-    # Set random seed
-    seed = args.seed
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-    if torch.cuda.is_available(): # cuda device
-        device='cuda'
-        torch.cuda.set_device(args.gpu)
-    else:
-        device='cpu'
-    batch_size = args.batch_size
-
-    print("\n**********************************************************************")
-    print("  SCALE: Single-Cell ATAC-seq Analysis via Latent feature Extraction")
-    print("**********************************************************************\n")
-
-    adata, trainloader, testloader = load_dataset(
-        args.data_list,
-        batch_categories=None,
-        join='inner',
-        batch_key='batch',
-        batch_name='batch',
-        min_genes=args.min_peaks,
-        min_cells=args.min_cells,
-        batch_size=args.batch_size,
-        n_top_genes=args.n_feature,
-        log=None,
+    adata = SCALE_function(
+        args.data_list,
+        n_centroids = args.n_centroids,
+        outdir = args.outdir,
+        verbose = args.verbose,
+        pretrain = args.pretrain,
+        lr = args.lr,
+        batch_size = args.batch_size,
+        gpu = args.gpu,
+        seed = args.seed,
+        encode_dim = args.encode_dim,
+        decode_dim = args.decode_dim,
+        latent = args.latent,
+        min_peaks = args.min_peaks,
+        min_cells = args.min_cells,
+        n_feature = args.n_feature,
+        log_transform = args.log_transform,
+        max_iter = args.max_iter,
+        weight_decay = args.weight_decay,
+        impute = args.impute,
+        binary = args.binary,
+        embed = args.embed,
+        reference = args.reference,
+        cluster_method = args.cluster_method,
     )
-
-    cell_num = adata.shape[0]
-    input_dim = adata.shape[1]
-
-#     if args.n_centroids is None:
-#         k = estimate_k(adata.X.T)
-#         print('Estimate k = {}'.format(k))
-#     else:
-#         k = args.n_centroids
-    lr = args.lr
-    k = args.n_centroids
-
-    outdir = args.outdir+'/'
-    if not os.path.exists(outdir):
-        os.makedirs(outdir)
-
-    print("\n======== Parameters ========")
-    print('Cell number: {}\nPeak number: {}\nn_centroids: {}\nmax_iter: {}\nbatch_size: {}\ncell filter by peaks: {}\npeak filter by cells: {}'.format(
-        cell_num, input_dim, k, args.max_iter, batch_size, args.min_peaks, args.min_cells))
-    print("============================")
-
-    dims = [input_dim, args.latent, args.encode_dim, args.decode_dim]
-    model = SCALE(dims, n_centroids=k)
-    print(model)
-
-    if not args.pretrain:
-        print('\n## Training Model ##')
-        model.init_gmm_params(testloader)
-        model.fit(trainloader,
-                  lr=lr,
-                  weight_decay=args.weight_decay,
-                  verbose=args.verbose,
-                  device = device,
-                  max_iter=args.max_iter,
-#                   name=name,
-                  outdir=outdir
-                  )
-        torch.save(model.state_dict(), os.path.join(outdir, 'model.pt')) # save model
-    else:
-        print('\n## Loading Model: {}\n'.format(args.pretrain))
-        model.load_model(args.pretrain)
-        model.to(device)
-
-    ### output ###
-    print('outdir: {}'.format(outdir))
-    # 1. latent feature
-    adata.obsm['latent'] = model.encodeBatch(testloader, device=device, out='z')
-
-    # 2. cluster
-    sc.pp.neighbors(adata, n_neighbors=30, use_rep='latent')
-    if args.cluster_method == 'leiden':
-        sc.tl.leiden(adata)
-    elif args.cluster_method == 'kmeans':
-        kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
-        adata.obs['kmeans'] = kmeans.fit_predict(adata.obsm['latent']).astype(str)
-
-#     if args.reference in adata.obs:
-#         cluster_report(adata.obs[args.reference].cat.codes, adata.obs[args.cluster_method].astype(int))
-
-    sc.settings.figdir = outdir
-    sc.set_figure_params(dpi=80, figsize=(6,6), fontsize=10)
-    if args.embed == 'UMAP':
-        sc.tl.umap(adata, min_dist=0.1)
-        color = [c for c in ['celltype', args.cluster_method] if c in adata.obs]
-        sc.pl.umap(adata, color=color, save='.png', wspace=0.4, ncols=4)
-    elif args.embed == 'tSNE':
-        sc.tl.tsne(adata, use_rep='latent')
-        color = [c for c in ['celltype', args.cluster_method] if c in adata.obs]
-        sc.pl.tsne(adata, color=color, save='.png', wspace=0.4, ncols=4)
-
-    if args.impute:
-        adata.obsm['impute'] = model.encodeBatch(testloader, device=device, out='x')
-    if args.binary:
-        adata.obsm['impute'] = model.encodeBatch(testloader, device=device, out='x')
-        adata.obsm['binary'] = binarization(adata.obsm['impute'], adata.X)
-        del adata.obsm['impute']
-
-    adata.write(outdir+'adata.h5ad', compression='gzip')
+    
\ No newline at end of file
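With this change `SCALE.py` becomes a thin argparse wrapper around `SCALE_function`, so the same run can be reproduced from Python. A minimal sketch, assuming an scATAC-seq input at a hypothetical path `data.h5ad`:

```python
import scanpy as sc
from scale import SCALE_function

# Roughly equivalent to `SCALE.py -d data.h5ad -o output/` (path is hypothetical):
adata = SCALE_function('data.h5ad', n_centroids=30, outdir='output')

# Or run entirely in memory on an existing AnnData, writing nothing to disk:
adata = SCALE_function(sc.read('data.h5ad'), outdir=None)
```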
diff --git a/scale/__init__.py b/scale/__init__.py
index 813addb..b68ef11 100755
--- a/scale/__init__.py
+++ b/scale/__init__.py
@@ -16,3 +16,230 @@
 from .loss import *
+#!/usr/bin/env python
+"""
+# Author: Xiong Lei
+# Created Time : Thu 31 May 2018 08:45:33 PM CST
+
+# File Name: __init__.py
+# Description:
+
+"""
+
+__author__ = "Lei Xiong"
+__email__ = "jsxlei@gmail.com"
+
+from .layer import *
+from .model import *
+from .loss import *
+from .dataset import load_dataset
+from .utils import estimate_k, binarization
+
+
+import time
+import torch
+
+import numpy as np
+import pandas as pd
+import os
+import scanpy as sc
+from anndata import AnnData
+from sklearn.cluster import KMeans
+
+from typing import Union, List
+
+def SCALE_function(
+        data_list: Union[str, List],
+        n_centroids: int = 30,
+        outdir: str = None,
+        verbose: bool = False,
+        pretrain: str = None,
+        lr: float = 0.0002,
+        batch_size: int = 64,
+        gpu: int = 0,
+        seed: int = 18,
+        encode_dim: List = [1024, 128],
+        decode_dim: List = [],
+        latent: int = 10,
+        min_peaks: int = 100,
+        min_cells: Union[float, int] = 3,
+        n_feature: int = 100000,
+        log_transform: bool = False,
+        max_iter: int = 30000,
+        weight_decay: float = 5e-4,
+        impute: bool = False,
+        binary: bool = False,
+        embed: str = 'UMAP',
+        reference: str = 'cell_type',
+        cluster_method: str = 'leiden',
+    ) -> AnnData:
+
+    """
+    Use SCALE [Xiong19]_ to cluster and impute scATAC-seq data
+
+    Parameters
+    ----------
+    data_list
+        An AnnData object, or a path to the input data: a file in h5ad, txt or csv format, or a directory containing an mtx file
+    n_centroids
+        Initial number of cluster centroids, default is 30
+    outdir
+        Output directory where results will be saved; if None, nothing is written to disk and results are only returned as an AnnData object
+    verbose
+        Verbosity, default is False
+    pretrain
+        Path to a pretrained model, used to project new data or reproduce results
+    lr
+        Learning rate for training the model
+    batch_size
+        Batch size of one iteration for training the model
+    gpu
+        Index of the GPU device to use
+    seed
+        Random seed
+    encode_dim
+        Encoder architecture (hidden layer dimensions)
+    decode_dim
+        Decoder architecture (hidden layer dimensions)
+    latent
+        Dimension of the latent space
+    min_peaks
+        Minimum number of peaks required to keep a cell
+    min_cells
+        Minimum number of cells required to keep a peak; interpreted as a fraction of cells if smaller than 1, default is 3
+    n_feature
+        Number of the most highly variable peaks used for training the model
+    log_transform
+        Perform a log2(x+1) transform of the data, recommended for scRNA-seq data
+    max_iter
+        Maximum number of iterations for training the model
+    weight_decay
+        Weight decay for training the model
+    impute
+        If True, store the imputed data in adata.obsm['impute']
+    binary
+        If True, store a binarized version of the imputed data in adata.obsm['binary']
+    embed
+        Embedding method, UMAP or tSNE, default is UMAP
+    reference
+        Reference annotations for evaluation, default is cell_type
+    cluster_method
+        Clustering method, default is leiden
+
+    Returns
+    -------
+    adata
+        An AnnData object
+
+
+    Example
+    -------
+    >>> import scanpy as sc
+    >>> from scale import SCALE_function
+    >>> adata = sc.read('Your/scATAC/Data')
+    >>> adata = SCALE_function(adata)
+
+    If imputed data is wanted:
+
+    >>> adata = SCALE_function(adata, impute=True)
+
+    The imputed data will be saved in adata.obsm['impute'],
+    and its binarized version in adata.obsm['binary'].
+    """
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if torch.cuda.is_available(): # cuda device
+        device='cuda'
+        torch.cuda.set_device(gpu)
+    else:
+        device='cpu'
+
+    print("\n**********************************************************************")
+    print("  SCALE: Single-Cell ATAC-seq Analysis via Latent feature Extraction")
+    print("**********************************************************************\n")
+
+
+    adata, trainloader, testloader = load_dataset(
+        data_list,
+        min_genes=min_peaks,
+        min_cells=min_cells,
+        n_top_genes=n_feature,
+        batch_size=batch_size,
+        log=None,
+    )
+
+    cell_num = adata.shape[0]
+    input_dim = adata.shape[1]
+
+    k = n_centroids
+
+    if outdir:
+        outdir = outdir+'/'
+        if not os.path.exists(outdir):
+            os.makedirs(outdir)
+        print('outdir: {}'.format(outdir))
+
+    print("\n======== Parameters ========")
+    print('Cell number: {}\nPeak number: {}\nmax_iter: {}\nbatch_size: {}\ncell filter by peaks: {}\npeak filter by cells: {}'.format(
+        cell_num, input_dim, max_iter, batch_size, min_peaks, min_cells))
+    print("============================")
+
+    dims = [input_dim, latent, encode_dim, decode_dim]
+    model = SCALE(dims, n_centroids=k)
+    # print(model)
+
+    if not pretrain:
+        print('\n## Training Model ##')
+        model.init_gmm_params(testloader)
+        model.fit(trainloader,
+                  lr=lr,
+                  weight_decay=weight_decay,
+                  verbose=verbose,
+                  device = device,
+                  max_iter=max_iter,
+                  outdir=outdir
+                  )
+        if outdir:
+            torch.save(model.state_dict(), os.path.join(outdir, 'model.pt')) # save model
+    else:
+        print('\n## Loading Model: {}\n'.format(pretrain))
+        model.load_model(pretrain)
+        model.to(device)
+
+    ### output ###
+
+    # 1. latent feature
+    adata.obsm['latent'] = model.encodeBatch(testloader, device=device, out='z')
+
+    # 2. cluster
+    sc.pp.neighbors(adata, n_neighbors=30, use_rep='latent')
+    if cluster_method == 'leiden':
+        sc.tl.leiden(adata)
+    elif cluster_method == 'kmeans':
+        kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
+        adata.obs['kmeans'] = kmeans.fit_predict(adata.obsm['latent']).astype(str)
+
+
+    sc.set_figure_params(dpi=80, figsize=(6,6), fontsize=10)
+    if outdir:
+        sc.settings.figdir = outdir
+        save = '.png'
+    else:
+        save = None
+    color = [c for c in ['celltype', 'cell_type', cluster_method] if c in adata.obs]
+    if embed == 'UMAP':
+        sc.tl.umap(adata, min_dist=0.1)
+        sc.pl.umap(adata, color=color, save=save, show=False, wspace=0.4, ncols=4)
+    elif embed == 'tSNE':
+        sc.tl.tsne(adata, use_rep='latent')
+        sc.pl.tsne(adata, color=color, save=save, show=False, wspace=0.4, ncols=4)
+
+    if impute or binary:
+        print("Imputation")
+        adata.obsm['impute'] = model.encodeBatch(testloader, device=device, out='x')
+    if binary:
+        adata.obsm['binary'] = binarization(adata.obsm['impute'], adata.X)
+
+    if outdir:
+        adata.write(outdir+'adata.h5ad', compression='gzip')
+
+    return adata
\ No newline at end of file
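Since `SCALE_function` stores everything on the returned AnnData, downstream access is plain scanpy. A short sketch of reading the outputs back, using the field names written by the function above (input path hypothetical):

```python
from scale import SCALE_function

adata = SCALE_function('data.h5ad', impute=True, binary=True)

latent = adata.obsm['latent']      # cells x latent-dim embedding (10 by default)
clusters = adata.obs['leiden']     # cluster labels from the default Leiden step
imputed = adata.obsm['impute']     # imputed cell-by-peak matrix
binarized = adata.obsm['binary']   # binarized version of the imputation
```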
diff --git a/scale/model.py b/scale/model.py
index 3d0e9e7..d419ad7 100755
--- a/scale/model.py
+++ b/scale/model.py
@@ -117,7 +117,7 @@ def fit(self, dataloader,
             max_iter=30000,
             verbose=True,
             patience=100,
-            outdir='./'
+            outdir=None,
         ):
 
         self.to(device)
@@ -238,7 +238,7 @@ def adjust_learning_rate(init_lr, optimizer, iteration):
 import os
 class EarlyStopping:
     """Early stops the training if loss doesn't improve after a given patience."""
-    def __init__(self, patience=10, verbose=False, outdir='./'):
+    def __init__(self, patience=10, verbose=False, outdir=None):
         """
         Args:
             patience (int): How long to wait after last time loss improved.
@@ -252,7 +252,7 @@ def __init__(self, patience=10, verbose=False, outdir='./'):
         self.best_score = None
         self.early_stop = False
         self.loss_min = np.Inf
-        self.model_file = os.path.join(outdir, 'model.pt')
+        self.model_file = os.path.join(outdir, 'model.pt') if outdir else None
 
     def __call__(self, loss, model):
         if np.isnan(loss):
@@ -278,5 +278,6 @@ def save_checkpoint(self, loss, model):
         '''Saves model when loss decrease.'''
         if self.verbose:
             print(f'Loss decreased ({self.loss_min:.6f} --> {loss:.6f}).  Saving model ...')
-        torch.save(model.state_dict(), self.model_file)
+        if self.model_file:
+            torch.save(model.state_dict(), self.model_file)
         self.loss_min = loss
diff --git a/setup.py b/setup.py
index 8b2edcb..fc4c6f2 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
     requirements = f.read().splitlines()
 
 setup(name='scale',
-      version='1.1.1',
+      version='1.1.2',
      packages=find_packages(),
      description='Single-Cell ATAC-seq Analysis via Latent feature Extraciton',
      long_description='',
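The `scale/model.py` hunks above make checkpointing optional: with `outdir=None`, `EarlyStopping` still tracks the best loss and can stop training, but never writes `model.pt`. A sketch of that behavior in isolation (the linear module is only a stand-in for the SCALE model):

```python
import torch.nn as nn
from scale.model import EarlyStopping

model = nn.Linear(16, 2)                           # stand-in model
stopper = EarlyStopping(patience=10, outdir=None)  # model_file stays None

for loss in [1.00, 0.90, 0.95, 0.93]:
    stopper(loss, model)   # tracks loss_min; torch.save is skipped
    if stopper.early_stop:
        break
```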