From bb3c4d04726455a1e9798eba7aaacef0c2c6c95c Mon Sep 17 00:00:00 2001
From: sreemol-gokuladhas
Date: Sat, 18 Jun 2022 20:07:30 +1200
Subject: [PATCH 1/2] Added new feature: PCHi-C data analysis

---
 codes3d/codes3d.py                            | 258 +++++----
 codes3d/genes.py                              | 264 ++++++---
 codes3d/interactions.py                       | 173 ++++--
 codes3d/snps.py                               | 502 ++++++++++++------
 codes3d/summary.py                            | 108 ++--
 .../create_index_pchic_sqlite3_db.py          |  31 ++
 data_preparation/init_pchic_gene_tables.py    |  78 +++
 data_preparation/init_pchic_meta.py           |  31 ++
 data_preparation/init_pchic_sqlite_db.py      | 155 ++++++
 .../init_variant_fragment_table_pchic.py      |  98 ++++
 .../tidy_promoter_baitmap_file.py             |  56 ++
 docs/codes3d.conf                             |   2 +-
 12 files changed, 1321 insertions(+), 435 deletions(-)
 create mode 100644 data_preparation/create_index_pchic_sqlite3_db.py
 create mode 100644 data_preparation/init_pchic_gene_tables.py
 create mode 100755 data_preparation/init_pchic_meta.py
 create mode 100644 data_preparation/init_pchic_sqlite_db.py
 create mode 100755 data_preparation/init_variant_fragment_table_pchic.py
 create mode 100644 data_preparation/tidy_promoter_baitmap_file.py

diff --git a/codes3d/codes3d.py b/codes3d/codes3d.py
index ef4e1b4..d8965fd 100755
--- a/codes3d/codes3d.py
+++ b/codes3d/codes3d.py
@@ -12,7 +12,6 @@ from sqlalchemy.pool import NullPool
 import tqdm
 import statsmodels.stats.multitest as multitest
-
 import snps
 import genes
 import interactions
@@ -52,10 +51,10 @@ def parse_tissues(user_tissues, match_tissues, eqtl_project, db):
         else:
             matched_df.append(u_df)
             matched_tissues.append(u_tissue)
-    error_msg = 'Program aborting:\n\t{}\nnot found in database.'
+    error_msg = 'Program aborting: {} not found in database.'
     if (len(matched_df) == 0 or len(not_tissues) > 0) and len(to_omit) == 0:
         print(error_msg.format('\n\t'.join(not_tissues)))
-        print('\nPlease use one of the following. ' +
+        print('\nPlease use one of the following with the -t flag. ' +
               'Tissue names are case sensitive:')
         list_eqtl_tissues(db)
         sys.exit()
@@ -93,7 +92,8 @@ def parse_hic(
         include_cell_line,
         exclude_cell_line,
         restriction_enzymes,
-        db):
+        db,
+        pchic=False):
     ''' user parameters -r, -n and -x.
     Args:
@@ -103,9 +103,13 @@ def parse_hic(
         exclude_cell_line: space-delimited list of cell_lines from -x
     Returns:
         hic_df: a dataframe columns(library, enzyme, rep_count)
+            of Hi-C libraries, or of PCHi-C libraries when pchic=True
     '''
-
-    sql = '''SELECT library, tags, enzyme, rep_count FROM meta_hic'''
+    if not pchic:
+        sql = '''SELECT library, tags, enzyme, rep_count FROM meta_hic'''
+    else:
+        sql = '''SELECT library, tags, enzyme, rep_count FROM meta_pchic'''
+
     df = pd.DataFrame()
     with db.connect() as con:
         df = pd.read_sql(sql, con=con)
@@ -122,6 +126,9 @@ def parse_hic(
             (df['tags'].str.contains(
                 r'\b{}\b'.format(u_tissue), case=False))
         ]
+        #u_df = df[
+        #    df['library'].str.contains(u_tissue,case=False) |
+        #    df['tags'].str.contains(u_tissue,case=False)]
         if u_df.empty:
             if u_tissue.startswith(u_tissue):
                 to_omit.append(u_tissue)
@@ -134,7 +141,7 @@ def parse_hic(
     if (len(matched_df) == 0 or len(not_tissues) > 0) and len(to_omit) == 0:
         print(error_msg.format('\n\t'.join(not_tissues)))
         print(('Use -t and -n to include specific eQTL tissues'
-               ' and Hi-C libraries. Library names are case sensitive:'))
+               ' and Hi-C/PCHi-C libraries. 
Library names are case sensitive:')) sys.exit('\n\t{}'.format('\n\t'.join(df['library'].tolist()))) if len(matched_df) == 0 and len(to_omit) > 0: hic_df = df @@ -150,7 +157,6 @@ def parse_hic( ~hic_df['tags'].str.contains( r'\b{}\b'.format(to_omit[i][1:]), case=False)] hic_df = hic_df.drop_duplicates() - elif include_cell_line and len(include_cell_line) > 0: validate_input(include_cell_line, df['library']) hic_df = df[df['library'].str.upper().isin( @@ -170,7 +176,6 @@ def parse_hic( else: return df.drop_duplicates() - def validate_input(input_params, params): input_params_upper = [c.upper() for c in input_params] not_found = set(input_params_upper).difference( @@ -303,8 +308,8 @@ def map_non_spatial_eqtls( commons_db, covariates_dir, expression_dir, - logger): - + logger, + pchic = False): gene_info_df = None gene_list = None if not snp_df.empty: @@ -312,12 +317,13 @@ def map_non_spatial_eqtls( print('No gene list given. Will map across all genes.') else: gene_info_df = genes.get_gene_info( - args.gene_list, - hic_df, - args.output_dir, - commons_db, - logger, - args.suppress_intermediate_files) + args.gene_list, + hic_df, + args.output_dir, + commons_db, + logger, + pchic, + args.suppress_intermediate_files) gene_info_df = gene_info_df.rename(columns={ 'name': 'gene', 'chrom': 'gene_chrom', @@ -409,7 +415,7 @@ def map_non_spatial_eqtls( ''' fp = os.path.join(args.output_dir, 'non_spatial_eqtls.txt') eqtl_df[cols].to_csv(fp, sep='\t', index=False) - logger.write(f'Output written to {fp}') + logger.write(f'Output written to {fp}.') msg = 'Done.\nTotal time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60) logger.write(msg) @@ -426,7 +432,8 @@ def map_spatial_eqtls( genotypes_fp, eqtl_project_db, covariates_dir, - expression_dir): + expression_dir, + pchic=False): gene_df = [] eqtl_df = [] batchsize = 2000 @@ -447,14 +454,15 @@ def map_spatial_eqtls( 'snp', C.lib_dir, hic_df, - # batch_output_dir, args.num_processes, - logger) + logger, + pchic) batch_gene_df = genes.get_gene_by_id( - batch_snp_df, - batch_interactions_df, - commons_db, - logger) + batch_snp_df, + batch_interactions_df, + commons_db, + logger, + pchic) gene_df.append(batch_gene_df) if batch_gene_df.empty: continue @@ -542,8 +550,6 @@ def __init__(self, config_fp): self.hic_restriction_enzymes = [ e.strip() for e in os.listdir(self.hic_dir)] - - def parse_args(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, @@ -568,6 +574,9 @@ def parse_args(): parser.add_argument( '-o', '--output-dir', help='The directory in which to output results.') + parser.add_argument( + '--pchic', action='store_true', default=False, + help='''Use this flag to use pchic datasets instead of hic.''') parser.add_argument( '--multi-test', default = 'multi', help='''Options for BH multiple-testing: ['snp', 'tissue', 'multi']. @@ -626,7 +635,7 @@ def parse_args(): Default is all tissues from the GTEx project. Use 'codes3d.py --list-eqtl-tissues' for a list of installed tissues.''') parser.add_argument( - '--eqtl-project', type=str, default=None, + '--eqtl-project', type=str, default='GTEx', help='''The eQTL project to query. Default: GTEx. 
'use \'codes3d.py --list-eqtl-db\' to list available databases.''') parser.add_argument( @@ -729,14 +738,33 @@ def log_settings(args, logger): logger.write(f'FDR threshold:\t{args.fdr_threshold}') logger.write(f'MAF threshold:\t{args.maf_threshold}') logger.write(f'Effect size:\t{effect_size}') - if not args.match_tissues and not args.include_cell_lines and \ - not args.exclude_cell_lines: + + if not args.pchic and not args.non_spatial: + logger.write('Using Hi-C datasets\n') + else: + if args.pchic and not args.non_spatial: + logger.write('Using PCHi-C datasets\n') + + if not args.pchic and not args.match_tissues and not args.include_cell_lines and \ + not args.exclude_cell_lines: if not args.non_spatial: logger.write('Hi-C libraries:\tAll libraries in database') else: - if not args.non_spatial: + if not args.pchic and not args.non_spatial: logger.write('Hi-C libraries:\t{}'.format( ', '.join(hic_df['library'].tolist()))) + + + if args.pchic and not args.match_tissues and not args.include_cell_lines and \ + not args.exclude_cell_lines: + if not args.non_spatial: + logger.write('PCHi-C libraries:\tAll libraries in database') + else: + if args.pchic and not args.non_spatial: + logger.write('PCHi-C libraries:\t{}'.format( + ', '.join(hic_df['library'].tolist()))) + + if not args.tissues and not args.match_tissues: logger.write('eQTL tissues:\tAll tissues in database\n') else: @@ -770,24 +798,42 @@ def log_settings(args, logger): args.include_cell_lines, args.exclude_cell_lines, args.restriction_enzymes, - commons_db) - if args.match_tissues and not args.non_spatial: - logger.write('Hi-C libraries:\t{}'.format( + commons_db, + args.pchic) + + if not args.pchic and args.match_tissues and not args.non_spatial: + logger.write('Using Hi-C libraries:\t{}'.format( ', '.join(hic_df['library'].tolist()))) logger.write('\neQTL tissues:\t{}\n'.format( ', '.join(tissues['name'].tolist()))) - - upsert = input( + + upsert = input( '''WARNING: We've tried to match your Hi-C and eQTL tissues above. - Continue? [y/N]''' - ) + Continue? [y/n]''') if not upsert.lower() == 'y': print(('Use -t and -n to include specific eQTL tissues' ' and Hi-C libraries')) sys.exit('Exiting.') + + if args.pchic and args.match_tissues and not args.non_spatial: + logger.write('Using PCHi-C libraries:\t{}'.format( + ', '.join(hic_df['library'].tolist()))) + logger.write('\neQTL tissues:\t{}\n'.format( + ', '.join(tissues['name'].tolist()))) + + upsert = input( + '''WARNING: We've tried to match your PCHi-C and eQTL tissues above. + Continue? 
[y/n]''') + if not upsert.lower() == 'y': + print(('Use -t and -n to include specific eQTL tissues' + ' and PCHi-C libraries')) + sys.exit('Exiting.') + log_settings(args, logger) + eqtl_project = tissues['project'].tolist()[0] + config = configparser.ConfigParser() config.read(args.config) covariates_dir = os.path.join( @@ -803,6 +849,7 @@ def log_settings(args, logger): snp_df = pd.DataFrame() gene_df = [] eqtl_df = [] + if args.gene_input: gene_info_df = genes.get_gene_info( args.gene_input, @@ -810,6 +857,7 @@ def log_settings(args, logger): args.output_dir, commons_db, logger, + args.pchic, args.suppress_intermediate_files) if args.non_spatial: map_non_spatial_eqtls( @@ -824,33 +872,36 @@ def log_settings(args, logger): commons_db, covariates_dir, expression_dir, - logger) + logger, + args.pchic) interactions_df = interactions.find_interactions( gene_info_df, 'gencode_id', C.lib_dir, hic_df, - #args.output_dir, args.num_processes, logger, + args.pchic, args.suppress_intermediate_files) snp_df, gene_df, eqtl_df = snps.find_snps( - interactions_df, - gene_info_df, - tissues, - args.output_dir, - C, - genotypes_fp, - eqtl_project_db, - covariates_dir, - expression_dir, - args.pval_threshold, - args.maf_threshold, - args.fdr_threshold, - args.num_processes, - commons_db, - logger, - args.suppress_intermediate_files) + interactions_df, + gene_info_df, + tissues, + args.output_dir, + C, + genotypes_fp, + eqtl_project_db, + covariates_dir, + expression_dir, + args.pval_threshold, + args.maf_threshold, + args.fdr_threshold, + args.num_processes, + commons_db, + logger, + args.pchic, + args.suppress_intermediate_files) + if gene_df.empty or snp_df.empty or eqtl_df.empty: logger.write('No eQTLs found.\nProgram exiting.') sys.exit() @@ -859,27 +910,32 @@ def log_settings(args, logger): args.output_dir, 'snps.txt'), sep='\t', index=False) gene_df.to_csv(os.path.join( args.output_dir, 'genes.txt'), sep='\t', index=False) + eqtl_df.to_csv(os.path.join( + args.output_dir, 'eqtls.txt'), sep='\t', index=False) + if args.snp_input or args.snps_within_gene: interactions_df = [] gene_df = [] gene_info_df = None if args.snps_within_gene: gene_info_df = genes.get_gene_info( - args.snps_within_gene, - hic_df, - args.output_dir, - commons_db, - logger, - args.suppress_intermediate_files) + args.snps_within_gene, + hic_df, + args.output_dir, + commons_db, + logger, + args.pchic, + args.suppress_intermediate_files) snp_df = snps.get_snp( - args.snp_input, - gene_info_df, - hic_df, - args.output_dir, - eqtl_project_db, - C.rs_merge_arch_fp, - logger, - args.suppress_intermediate_files) + args.snp_input, + gene_info_df, + hic_df, + args.output_dir, + eqtl_project_db, + C.rs_merge_arch_fp, + logger, + args.pchic, + args.suppress_intermediate_files) if not args.suppress_intermediate_files: snp_df.to_csv(os.path.join( args.output_dir, 'snps.txt'), sep='\t', index=False) @@ -887,41 +943,49 @@ def log_settings(args, logger): if args.non_spatial: map_non_spatial_eqtls( - snp_df, - pd.DataFrame(), - tissues, - hic_df, - C, - args, - genotypes_fp, - eqtl_project_db, - commons_db, - covariates_dir, - expression_dir, - logger) + snp_df, + pd.DataFrame(), + tissues, + hic_df, + C, + args, + genotypes_fp, + eqtl_project_db, + commons_db, + covariates_dir, + expression_dir, + logger, + args.pchic) else: gene_df, eqtl_df = map_spatial_eqtls( - snp_list, - C, - hic_df, - args, - logger, - commons_db, - tissues, - genotypes_fp, - eqtl_project_db, - covariates_dir, - expression_dir) + snp_list, + C, + hic_df, + args, + logger, + 
commons_db, + tissues, + genotypes_fp, + eqtl_project_db, + covariates_dir, + expression_dir, + args.pchic + ) + eqtl_df = multi_test_correction(eqtl_df, args.multi_test.lower()) - logger.write(' * {} eQTL associations passed FDR <= {}.'.format( - len(eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold]), - args.fdr_threshold)) + #logger.write(' * {} eQTL associations passed FDR <= {}.'.format( + # len(eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold]), + # args.fdr_threshold)) logger.write(' * eQTLs mapped at MAF >= {} and pval threshold <={}.'.format( args.maf_threshold, args.pval_threshold)) if not args.suppress_intermediate_files: eqtl_df.to_csv(os.path.join( args.output_dir, 'eqtls.txt'), sep='\t', index=False) eqtl_df = eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold] + + if eqtl_df.empty: + logger.write(' * No significant eQTL associations found. Program exiting.') + sys.exit() if not args.do_not_produce_summary: if not args.no_afc: @@ -939,7 +1003,7 @@ def log_settings(args, logger): summary.produce_summary( eqtl_df, snp_df, gene_df, expression_table_fp, args.fdr_threshold, args.output_dir, - args.num_processes, args.output_format, args.no_afc, logger) + args.num_processes, args.output_format, args.no_afc, logger, args.pchic) msg = 'Done.\nTotal time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60) logger.write(msg) diff --git a/codes3d/genes.py b/codes3d/genes.py index cea6e11..271947c 100755 --- a/codes3d/genes.py +++ b/codes3d/genes.py @@ -12,7 +12,7 @@ from itertools import repeat -def get_gene_fragments(gene_df, restriction_enzymes, db): +def get_gene_fragments(gene_df, restriction_enzymes, db, pchic=False): db.dispose() gene_df = gene_df.sort_values(by=['id']) fragment_df = [] @@ -20,20 +20,42 @@ def get_gene_fragments(gene_df, restriction_enzymes, db): chunks = [gene_df[i:i+chunksize] for i in range(0, gene_df.shape[0], chunksize)] for enzyme in restriction_enzymes: - table = 'gene_lookup_{}' + if pchic: + table = 'gene_lookup_pchic_{}' + else: + table = 'gene_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - sql = '''SELECT * FROM {} WHERE id >= {} AND id <= {}''' + + if pchic: + sql = '''SELECT * FROM {} WHERE gencode_id = '{}' ''' + else: + sql = '''SELECT * FROM {} WHERE id >= {} AND id <= {}''' + with db.connect() as con: for chunk in chunks: - df = pd.read_sql(sql.format( - table, chunk['id'].min(), chunk['id'].max()), con=con) - df['enzyme'] = enzyme - fragment_df.append(df) - fragment_df = pd.concat(fragment_df) - gene_df = pd.merge(gene_df, fragment_df, how='inner', on=['id', 'chrom']) + if not pchic: + df = pd.read_sql(sql.format( + table, chunk['id'].min(), chunk['id'].max()), con=con) + df['enzyme'] = enzyme + fragment_df.append(df) + else: + for _, row in chunk.iterrows(): + df = pd.read_sql(sql.format( + table, row['gencode_id']), con=con) + df['enzyme'] = enzyme + fragment_df.append(df) + fragment_df = pd.concat(fragment_df).drop_duplicates() + if pchic: + gene_df = pd.merge(gene_df, fragment_df, how='inner', + left_on = ['chrom','start','end','name','gencode_id'], + right_on = ['chr', 'start', 'end', 'gene', 'gencode_id']) + else: + gene_df = pd.merge(gene_df, fragment_df, how='inner', on=['id', 'chrom']) + return gene_df @@ -70,59 +92,88 @@ def process_snp_genes( def find_snp_genes( chunk_df, enzyme, - enzyme_genes): + enzyme_genes, + pchic=False): db.dispose() - chunk_df = chunk_df.sort_values(by=['fragment']) - chrom = 
chunk_df['fragment_chr'].drop_duplicates().to_list()[0] - table = 'gene_lookup_{}' + if pchic: + #celline = chunk_df['cell_line'].drop_duplicates().to_list() + table = 'gene_lookup_pchic_{}' + else: + chunk_df = chunk_df.sort_values(by=['fragment']) + chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0] + table = 'gene_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - chunk_df['fragment'] = chunk_df['fragment'].astype(int) - sql = ''' SELECT * FROM {0} - JOIN genes on {0}.id=genes.id - WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}''' - sql = sql.format(table, chrom, + + if pchic: + inter_df_ls = chunk_df['inter_fid'].unique().tolist() + if len(inter_df_ls) > 1: + inter_df_ls = tuple(inter_df_ls) + sql = '''SELECT * FROM {} WHERE frag_id IN {}'''.format(table, inter_df_ls) + else: + inter_df_ls = (inter_df_ls[0]) + sql = '''SELECT * FROM {} WHERE frag_id = {}'''.format(table, inter_df_ls) + else: + chunk_df['fragment'] = chunk_df['fragment'].astype(int) + sql = ''' SELECT * FROM {0} + JOIN genes on {0}.id=genes.id + WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}''' + sql = sql.format(table, chrom, chunk_df['fragment'].min(), chunk_df['fragment'].max()) df = pd.DataFrame() with db.connect() as con: df = pd.read_sql_query(sql, con) if df.empty: return - df = df.loc[:, ~df.columns.duplicated()] - df = df.rename( - columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr', - 'start': 'gene_start', 'end': 'gene_end'}) - chunk_df = pd.merge(chunk_df, df, how='inner', + + if pchic: + df = df.rename( + columns={'chr': 'gene_chr', 'start': 'gene_start', 'end': 'gene_end'}) + chunk_df = pd.merge(chunk_df, df, how= 'inner', sort=False, left_on='inter_fid', + right_on='frag_id') + else: + df = df.loc[:, ~df.columns.duplicated()] + df = df.rename( + columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr', + 'start': 'gene_start', 'end': 'gene_end'}) + chunk_df = pd.merge(chunk_df, df, how='inner', left_on=['fragment_chr', 'fragment'], right_on=['gene_chr', 'frag_id']) enzyme_genes.append(chunk_df) -def fetch_hic_libs(db): +def fetch_3dgi_libs(db, pchic=False): with db.connect() as con: - hic_libs = pd.read_sql_query( - 'SELECT library, enzyme, rep_count FROM meta_hic', - con=con) - return hic_libs.drop_duplicates() - + if pchic: + _3dgi_libs = pd.read_sql_query( + 'SELECT library, enzyme, rep_count FROM meta_pchic', con=con) + else: + _3dgi_libs = pd.read_sql_query( + 'SELECT library, enzyme, rep_count FROM meta_hic', con=con) + + return _3dgi_libs.drop_duplicates() def get_gene_by_id( snp_df, inter_df, _db, logger, -): - logger.write('Identifying genes interacting with SNPs in...') + pchic=False): + if pchic: + logger.write('Identifying gene promoters interacting with SNPs in...') + else: + logger.write('Identifying genes interacting with SNPs in...') global db db = _db start_time = time.time() enzymes = inter_df['enzyme'].drop_duplicates().tolist() all_genes_df = [] #db = create_engine(db_url, echo=False, poolclass=NullPool) - hic_libs = fetch_hic_libs(db) - hic_libs = hic_libs.rename(columns={'rep_count': 'cell_line_replicates'}) + _3dgi_libs = fetch_3dgi_libs(db, pchic) + _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'}) for enzyme in enzymes: manager = multiprocessing.Manager() num_processes = int(min(16, multiprocessing.cpu_count()/2)) @@ -130,79 +181,112 @@ def get_gene_by_id( enzyme_df 
= []
         with multiprocessing.Pool(processes=num_processes) as pool:
             df = inter_df[inter_df['enzyme'] == enzyme]
-            snp_interactions = [
-                df[df['fragment_chr'] == chrom]
-                for chrom in df['fragment_chr'].drop_duplicates().to_list()
-            ]
-            desc = ' * Hi-C libraries restricted with {}'.format(enzyme)
+            if pchic:
+                df_subset = df[['p_fid', 'oe_fid', 'n_reads', 'score',
+                                'query_type', 'query_fragment', 'replicate', 'cell_line', 'enzyme']]
+                df_subset['inter_fid'] = np.where(df_subset['query_fragment'] ==
+                                                  df_subset['p_fid'], df_subset['oe_fid'], df_subset['p_fid'])
+                snp_interactions = [df_subset[df_subset['cell_line'] == celline]
+                                    for celline in df_subset['cell_line'].drop_duplicates().to_list()
+                                    ]
+                desc = ' * PCHi-C libraries restricted with {}'.format(enzyme)
+            else:
+                snp_interactions = [
+                    df[df['fragment_chr'] == chrom]
+                    for chrom in df['fragment_chr'].drop_duplicates().to_list()
+                ]
+                desc = ' * Hi-C libraries restricted with {}'.format(enzyme)
             bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}'
-            '''
-            for snp in snp_interactions:
-                find_snp_genes(snp,
-                               enzyme,
-                               enzyme_genes,
-                               db)
-            '''
             for _ in tqdm.tqdm(pool.istarmap(
                 find_snp_genes,
                 zip(snp_interactions,
                     repeat(enzyme),
-                    repeat(enzyme_genes))),
+                    repeat(enzyme_genes),
+                    repeat(pchic))),
                     total=len(snp_interactions), desc=desc, unit='batches',
                     ncols=80, bar_format=bar_format):
                 pass
-
         for df in enzyme_genes:
             enzyme_df.append(df)
         enzyme_df = pd.concat(enzyme_df)
         enzyme_df = enzyme_df.merge(
-            hic_libs, how='left',
+            _3dgi_libs, how='left',
             left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme'])
-        enzyme_df['interactions'] = enzyme_df.groupby(
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
-            'fragment'].transform('count')
-        df = enzyme_df[
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate']
-        ].drop_duplicates()
-        df['replicates'] = df.groupby(
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
-            'replicate'].transform('count')
-        enzyme_df = enzyme_df.merge(
-            df, how='left',
-            on=['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate'])
-        enzyme_df = enzyme_df.drop(
-            columns=['fragment_chr', 'fragment',
+        if pchic:
+            enzyme_df = enzyme_df.drop(
+                columns=['p_fid','oe_fid','library','frag_id','cell_line_replicates'])
+        else:
+            enzyme_df['interactions'] = enzyme_df.groupby(
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
+                'fragment'].transform('count')
+            df = enzyme_df[
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate']
+            ].drop_duplicates()
+            df['replicates'] = df.groupby(
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
+                'replicate'].transform('count')
+            enzyme_df = enzyme_df.merge(
+                df, how='left',
+                on=['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate'])
+            enzyme_df = enzyme_df.drop(
+                columns=['fragment_chr', 'fragment',
                          'frag_id', 'library', 'replicate']
-        )
+            )
         enzyme_df = enzyme_df.drop_duplicates()
         all_genes_df.append(enzyme_df.drop_duplicates())
-    logger.write('  * Filtering gene interactions...')
+    if not pchic:
+        logger.write('  * Filtering gene interactions...')
     all_genes_df = pd.concat(all_genes_df)
-    all_genes_df = all_genes_df.merge(
-        snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'],
-        right_on=['fragment', 'chrom', 'enzyme'], how='inner'
-    )  # .drop_duplicates()
-    all_genes_df['sum_cell_lines'] = all_genes_df.groupby(
-        ['snp', 'gene'])['cell_line'].transform('count')
-    all_genes_df['sum_interactions'] = all_genes_df.groupby(
-        ['snp', 'gene'])[
-        
'interactions'].transform('sum') - all_genes_df['sum_replicates'] = all_genes_df.groupby( - ['snp', 'gene'])[ - 'replicates'].transform('sum') - condition = ( - (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) & - (all_genes_df['sum_replicates'] < 2) & - (all_genes_df['sum_cell_lines'] < 2)) - gene_df = all_genes_df[~condition].drop_duplicates() - cols = ['snp', 'chrom', 'locus', 'variant_id', + + if pchic: + all_genes_df = all_genes_df.drop_duplicates() + df = all_genes_df[['query_fragment','gencode_id','cell_line','n_reads', + 'score']].drop_duplicates() + df['N_reads'] = df.groupby(['query_fragment','gencode_id','cell_line'])[ + 'n_reads'].transform('sum') + df['Score'] = df.groupby(['query_fragment','gencode_id','cell_line','N_reads'])[ + 'score'].transform('mean').round(2) + df = df[['query_fragment','gencode_id','cell_line','N_reads','Score']].drop_duplicates() + all_genes_df = all_genes_df.merge( + df, on=['query_fragment','gencode_id','cell_line'], + how='left' + ).drop(['n_reads','score','inter_fid'], axis=1).drop_duplicates() + all_genes_df = all_genes_df.merge( + snp_df, left_on=['query_fragment','enzyme'], + right_on=['fragment','enzyme'], + how='inner' + ).drop_duplicates() + gene_df = all_genes_df[['snp', 'chrom', 'locus', 'variant_id', 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', - 'interactions', 'replicates', 'cell_line', 'cell_line_replicates', - 'sum_interactions', 'sum_replicates', 'sum_cell_lines'] + 'N_reads', 'Score', 'cell_line']].drop_duplicates() + else: + all_genes_df = all_genes_df.merge( + snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'], + right_on=['fragment', 'chrom', 'enzyme'], how='inner' + ) # .drop_duplicates() + all_genes_df['sum_cell_lines'] = all_genes_df.groupby( + ['snp', 'gene'])['cell_line'].transform('count') + all_genes_df['sum_interactions'] = all_genes_df.groupby( + ['snp', 'gene'])[ + 'interactions'].transform('sum') + all_genes_df['sum_replicates'] = all_genes_df.groupby( + ['snp', 'gene'])[ + 'replicates'].transform('sum') + condition = ( + (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) & + (all_genes_df['sum_replicates'] < 2) & + (all_genes_df['sum_cell_lines'] < 2)) + gene_df = all_genes_df[~condition].drop_duplicates() + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'interactions', 'replicates', 'cell_line', 'cell_line_replicates', + 'sum_interactions', 'sum_replicates', 'sum_cell_lines'] logger.write(' Time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60)) - return gene_df[cols].drop_duplicates() - + if pchic: + return gene_df + else: + return gene_df[cols].drop_duplicates() def get_gene_by_position(df, db): gene_df = [] @@ -288,6 +372,7 @@ def get_gene_info( output_dir, db, logger, + pchic = False, suppress_intermediate_files=False): enzymes = hic_df['enzyme'].drop_duplicates().tolist() gene_df = [] @@ -340,7 +425,10 @@ def get_gene_info( len(omitted_genes)) msg = msg + ':\n\t{}'.format(', '.join(omitted_genes)) logger.write(msg) - - gene_df = get_gene_fragments(gene_df, enzymes, db) + gene_df = get_gene_fragments(gene_df, enzymes, db, pchic) gene_df = gene_df.rename(columns={'frag_id': 'fragment'}) + if pchic: + gene_df = gene_df.drop(['chr','gene'], axis=1) + else: + pass return(gene_df) diff --git a/codes3d/interactions.py b/codes3d/interactions.py index 4fb486d..ba02b30 100755 --- a/codes3d/interactions.py +++ b/codes3d/interactions.py @@ -17,47 +17,103 @@ def 
get_cell_interactions( replicate, query_type, df, - interactions): + interactions, + pchic=False): cell_line = os.path.dirname(replicate).split('/')[-1] rep_interactions = [] # TODO: Add implementation for PostgreSQL database - sql = ''' - SELECT chr2, fragment2 FROM interactions WHERE chr1='{}' AND fragment1={} - ''' - with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: - for idx, row in df.iterrows(): - from_db = pd.read_sql_query(sql.format( - row['chrom'][3:], row['fragment']), con=con) - if not from_db.empty: - from_db = from_db.rename( - columns={'chr2': 'fragment_chr', 'fragment2': 'fragment'}) - #from_db[query_type] = row[query_type] - from_db['fragment_chr'] = 'chr' + \ - from_db['fragment_chr'].astype(str) - from_db['query_type'] = query_type - from_db['query_chr'] = row['chrom'] - from_db['query_fragment'] = row['fragment'] - from_db['fragment'] = from_db['fragment'].astype('int64') - rep_interactions.append(from_db) - if len(rep_interactions) == 0: - return - rep_interactions = pd.concat(rep_interactions) - # A fragment can have many interactions with the same fragment - rep_interactions = rep_interactions.drop_duplicates() # Keep unique replicates - rep = os.path.basename(replicate).split('.')[0] - rep = rep[:rep.rfind('_merged')].split('_')[-1] - rep_interactions['replicate'] = rep - rep_interactions['cell_line'] = cell_line - rep_interactions['enzyme'] = enzyme - self_ligation = ( # Filter self-ligating fragments - (rep_interactions['fragment_chr'] == rep_interactions['query_chr']) &\ - (rep_interactions['fragment'] == rep_interactions['query_fragment'])) - rep_interactions = rep_interactions[~self_ligation] + if pchic: + sql = '''SELECT * from interactions WHERE p_fid={} OR oe_fid={}''' + with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: + for idx, row in df.iterrows(): + from_db = pd.read_sql_query(sql.format( + row['fragment'], row['fragment']), con=con) + if not from_db.empty: + from_db_subset = from_db[['p_fid','p_name','oe_fid','oe_name', + 'n_reads','score']] + from_db_subset['query_type'] = query_type + from_db_subset['query_fragment'] = row['fragment'] + rep_interactions.append(from_db_subset) + if len(rep_interactions) == 0: + return + rep_interactions = pd.concat(rep_interactions) + rep_interactions = rep_interactions.drop_duplicates() + rep = os.path.basename(replicate).split('.')[0] + rep_interactions['replicate'] = rep + rep_interactions['cell_line'] = cell_line + rep_interactions['enzyme'] = enzyme + interactions.append(rep_interactions) - interactions.append(rep_interactions) + else: + sql = ''' + SELECT chr2, fragment2 FROM interactions WHERE chr1='{}' AND fragment1={} + ''' + with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: + for idx, row in df.iterrows(): + from_db = pd.read_sql_query(sql.format( + row['chrom'][3:], row['fragment']), con=con) + if not from_db.empty: + from_db = from_db.rename( + columns={'chr2': 'fragment_chr', 'fragment2': 'fragment'}) + #from_db[query_type] = row[query_type] + from_db['fragment_chr'] = 'chr' + \ + from_db['fragment_chr'].astype(str) + from_db['query_type'] = query_type + from_db['query_chr'] = row['chrom'] + from_db['query_fragment'] = row['fragment'] + from_db['fragment'] = from_db['fragment'].astype('int64') + rep_interactions.append(from_db) + if len(rep_interactions) == 0: + return + rep_interactions = pd.concat(rep_interactions) + # A fragment can have many interactions with the same fragment + rep_interactions = 
rep_interactions.drop_duplicates() # Keep unique replicates + rep = os.path.basename(replicate).split('.')[0] + rep = rep[:rep.rfind('_merged')].split('_')[-1] + rep_interactions['replicate'] = rep + rep_interactions['cell_line'] = cell_line + rep_interactions['enzyme'] = enzyme + self_ligation = ( # Filter self-ligating fragments + (rep_interactions['fragment_chr'] == rep_interactions['query_chr']) &\ + (rep_interactions['fragment'] == rep_interactions['query_fragment'])) + rep_interactions = rep_interactions[~self_ligation] + interactions.append(rep_interactions) +def filter_inter_for_pchic(interactions, query_type): + pchic_interactions = [] + if query_type == 'snp': + for row in interactions.itertuples(index=False): + # remove snps that interact with unannotated oe_frag + if row.query_fragment == row.p_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name == ".": + pchic_interactions.append(row) + elif row.query_fragment == row.p_fid and \ + row.oe_name == ".": + pass + elif query_type == 'gencode_id': + for row in interactions.itertuples(index=False): + if (row.query_fragment == row.p_fid) and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.p_fid and \ + row.oe_name == ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name == ".": + pass + pchic_interactions = pd.DataFrame(pchic_interactions) + return pchic_interactions def find_interactions( fragment_df, @@ -66,16 +122,19 @@ def find_interactions( hic_df, num_processes, logger, + pchic=False, suppress_intermediate_files=False): """Finds fragments that interact with gene fragments. 
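With Hi-C libraries every query fragment is looked up in the
     interactions table of each replicate database. With PCHi-C
     (pchic=True) the query fragment is matched against both the bait
     (p_fid) and other-end (oe_fid) columns, and the result is then
     filtered by filter_inter_for_pchic(), which drops bait-side SNP
     hits whose other end is unannotated (oe_name == ".").
     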
""" start_time = time.time() logger.write("Finding chromatin interactions in...") - # print(fragment_df) restriction_enzymes = fragment_df['enzyme'].drop_duplicates().tolist() interactions_df = [] for enzyme in restriction_enzymes: - hic_data_dir = os.path.join(lib_dir, 'hic', enzyme, 'hic_data') + if pchic: + _3dgi_data_dir = os.path.join(lib_dir, 'pchic', enzyme, 'pchic_data') + else: + _3dgi_data_dir = os.path.join(lib_dir, 'hic', enzyme, 'hic_data') manager = multiprocessing.Manager() gene_interactions = manager.list() enzyme_qc_df = manager.list() @@ -84,16 +143,19 @@ def find_interactions( cell_lines = hic_df[hic_df['enzyme'] == enzyme]['library'].tolist() replicates = [] for cell_line in cell_lines: # TODO: Replace with database - replicates += [os.path.join(hic_data_dir, cell_line, replicate) + replicates += [os.path.join(_3dgi_data_dir, cell_line, replicate) for replicate in os.listdir( - os.path.join(hic_data_dir, cell_line)) + os.path.join(_3dgi_data_dir, cell_line)) if replicate.endswith('.db')] enzyme_df = fragment_df[fragment_df['enzyme'] == enzyme] enzyme_df = enzyme_df[['fragment', 'chrom', 'enzyme']].drop_duplicates() with multiprocessing.Pool(num_processes) as pool: - desc = ' * Hi-C libraries restricted with {}'.format(enzyme) + if pchic: + desc = ' * PCHi-C libraries restricted with {}'.format(enzyme) + else: + desc = ' * Hi-C libraries restricted with {}'.format(enzyme) bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' for _ in tqdm.tqdm( @@ -103,7 +165,8 @@ def find_interactions( replicates, repeat(query_type), repeat(enzyme_df), - repeat(gene_interactions) + repeat(gene_interactions), + repeat(pchic) ) ), total=len(replicates), @@ -114,12 +177,26 @@ def find_interactions( pass for df in gene_interactions: interactions_df.append(df) - interactions_df = pd.concat(interactions_df) - cols = ['cell_line', 'query_type', 'query_chr', 'query_fragment', - 'fragment_chr', 'fragment', # 'fragment_distance', - 'replicate', 'enzyme'] - interactions_df = interactions_df[cols] - interactions_df = interactions_df.drop_duplicates() - logger.write(' Time elasped: {:.2f} mins.'.format( - (time.time() - start_time)/60)) + if not interactions_df: + msg = '''Program exiting: Chromatin interactions involving input + SNP(s)/gene(s) could not be found.''' + sys.exit(msg) + logger.write(msg) + else: + interactions_df = pd.concat(interactions_df) + if pchic: + cols = ['p_fid', 'p_name', 'oe_fid', 'oe_name', 'n_reads', + 'score', 'query_type', 'query_fragment', + 'replicate', 'cell_line', 'enzyme'] + interactions_df = interactions_df[cols] + interactions_df = interactions_df.drop_duplicates() + interactions_df = filter_inter_for_pchic(interactions_df, query_type) + logger.write(' Time elasped: {:.2f} mins.'.format((time.time() - start_time)/60)) + else: + cols = ['cell_line', 'query_type', 'query_chr', 'query_fragment', + 'fragment_chr', 'fragment', 'replicate', 'enzyme'] + interactions_df = interactions_df[cols] + interactions_df = interactions_df.drop_duplicates() + logger.write(' Time elasped: {:.2f} mins.'.format((time.time() - start_time)/60)) + return interactions_df diff --git a/codes3d/snps.py b/codes3d/snps.py index 4772e6c..edc638a 100755 --- a/codes3d/snps.py +++ b/codes3d/snps.py @@ -29,6 +29,7 @@ def find_snps( num_processes, _db, logger, + pchic=False, suppress_intermediate_files=False ): start_time = time.time() @@ -37,17 +38,27 @@ def find_snps( global db db = _db enzymes = 
inter_df['enzyme'].drop_duplicates().tolist() - hic_libs = genes.fetch_hic_libs(db) - hic_libs = hic_libs.rename(columns={'rep_count': 'cell_line_replicates'}) + _3dgi_libs = genes.fetch_3dgi_libs(db, pchic) + _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'}) inter_df = inter_df.merge( - hic_libs, how='left', - left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme']) - default_chrom = ['chr' + str(i) - for i in list(range(1, 23))] + ['X', 'Y', 'M'] - chrom_list = inter_df['fragment_chr'].drop_duplicates().tolist() - chrom_list = [i for i in default_chrom if i in chrom_list] - inter_df = inter_df[inter_df['fragment_chr'].isin(default_chrom)] - inter_df = inter_df.astype({'fragment': int}) + _3dgi_libs, how='left', + left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme']) + if pchic: + inter_df = inter_df[['p_fid','oe_fid','n_reads','score', + 'query_type', 'query_fragment', 'replicate', 'cell_line', 'enzyme', + 'library']] + inter_df['inter_frag'] = np.where(inter_df['query_fragment'] == + inter_df['p_fid'], inter_df['oe_fid'], inter_df['p_fid']) + inter_df = inter_df[['n_reads','score','query_type','query_fragment','inter_frag', + 'replicate','cell_line','enzyme']].drop_duplicates() + else: + default_chrom = ['chr' + str(i) + for i in list(range(1, 23))] + ['X', 'Y', 'M'] + chrom_list = inter_df['fragment_chr'].drop_duplicates().tolist() + chrom_list = [i for i in default_chrom if i in chrom_list] + inter_df = inter_df[inter_df['fragment_chr'].isin(default_chrom)] + inter_df = inter_df.astype({'fragment': int}) + gene_info_df = gene_info_df.rename( columns={ 'name': 'gene', @@ -59,113 +70,208 @@ def find_snps( all_snps = [] all_genes = [] all_eqtls = [] - logger.write('Finding SNPs within fragments interacting with genes in...') - for chrom in sorted(chrom_list): - chrom_dir = os.path.join(output_dir, chrom) - #if os.path.exists(os.path.join(chrom_dir, 'eqtls.txt')): - # logger.write(' Warning: {} already exists. 
Skipping.'.format( - # os.path.join(chrom_dir, 'eqtls.txt'))) - # continue - logger.write(' Chromosome {}'.format(chrom)) - snp_cols = ['snp', 'variant_id', 'chr', - 'locus', 'id', 'fragment', 'enzyme'] - chrom_df = inter_df[inter_df['fragment_chr'] == chrom] - chrom_df = chrom_df.astype({'fragment': int, - 'fragment_chr': object}) - enzymes = chrom_df['enzyme'].drop_duplicates().tolist() + + if pchic: + logger.write('Finding SNPs within fragments interacting with gene promoters in...') + else: + logger.write('Finding SNPs within fragments interacting with genes in...') + + if pchic: snp_df = [] for enzyme in enzymes: - enzyme_df = chrom_df[chrom_df['enzyme'] == enzyme] + enzyme_dir = os.path.join(output_dir, enzyme) + enzyme_df = inter_df[inter_df['enzyme'] == enzyme] enzyme_df = enzyme_df.merge( - gene_info_df, how='inner', - left_on=['query_chr', 'query_fragment', 'enzyme'], - right_on=['chrom', 'gene_fragment', 'enzyme']) + gene_info_df, how='inner', + left_on = ['query_fragment', 'enzyme'], + right_on = ['gene_fragment', 'enzyme']) fragment_df = enzyme_df[ - ['gencode_id', 'fragment_chr', 'fragment']].drop_duplicates() - enzyme_df = enzyme_df.sort_values(by=['fragment']) + ['gencode_id','query_fragment','inter_frag','project']].drop_duplicates() + enzyme_df = enzyme_df.drop(columns=['project']).drop_duplicates() + enzyme_df = enzyme_df.sort_values(by=['inter_frag']) chunksize = 20000 enzyme_chunks = [enzyme_df[i:i+chunksize] - for i in range(0, enzyme_df.shape[0], chunksize)] + for i in range(0, enzyme_df.shape[0], chunksize)] manager = multiprocessing.Manager() snps = manager.list() - desc = ' * Hi-C libraries restricted with {}'.format( - enzyme) + desc = ' * PCHi-C libraries restricted with {}'.format(enzyme) bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' - ''' - for df in tqdm.tqdm(enzyme_chunks, desc=desc, unit='batches', - ncols=80, bar_format=bar_format): - find_gene_snps( - df, - enzyme, - snps) - ''' with multiprocessing.Pool(processes=4) as pool: for _ in tqdm.tqdm( pool.istarmap( find_gene_snps, zip(enzyme_chunks, repeat(enzyme), - repeat(snps)) - ), + repeat(snps), + repeat(pchic)) + ), total=len(enzyme_chunks), desc=desc, unit='batches', ncols=80, bar_format=bar_format): pass + for df in snps: + df['enzyme'] = enzyme + snp_df.append(df) + if len(snp_df) == 0: + continue + snp_df = pd.concat(snp_df) + logger.verbose = False + gene_df, snp_df = filter_snp_fragments( + snp_df, logger, pchic) + snp_df.sort_values(by=['variant_id'], inplace=True) + snp_list = snp_df['variant_id'].drop_duplicates().tolist() + batchsize = 2000 + snp_batches = [snp_list[i:i + batchsize] + for i in range(0, len(snp_list), batchsize)] + chrom_eqtl_df = [] + for batch_num, snp_batch in enumerate(snp_batches): + if len(snp_batches) > 1: + logger.verbose = True + logger.write(' Mapping eQTLs batch {} of {}'.format( + batch_num+1, len(snp_batches))) + logger.verbose = False + batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] + eqtl_df = eqtls.map_eqtls( + batch_gene_df, + tissues, + output_dir, + C, + genotypes_fp, + num_processes, + eqtl_project_db, + covariates_dir, + expression_dir, + pval_threshold, + maf_threshold, + fdr_threshold, + logger) + if eqtl_df is None: + continue + chrom_eqtl_df.append(eqtl_df) + if len(chrom_eqtl_df) > 0: + chrom_eqtl_df = pd.concat(chrom_eqtl_df) + else: + chrom_eqtl_df = pd.DataFrame() + if not suppress_intermediate_files: + os.makedirs(enzyme_dir, exist_ok=True) + snp_df.to_csv(os.path.join(enzyme_dir, 'snps.txt'), + 
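# note: PCHi-C intermediates are written per restriction enzyme
+                          # (enzyme_dir), not per chromosome as in the Hi-C branch
+                          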
sep='\t', index=False) + gene_df.to_csv(os.path.join(enzyme_dir, 'genes.txt'), + sep='\t', index=False) + chrom_eqtl_df.to_csv(os.path.join(enzyme_dir, 'eqtls.txt'), + sep='\t', index=False) + all_eqtls.append(chrom_eqtl_df) + all_snps.append(snp_df) + all_genes.append(gene_df) + logger.verbose = True - for df in snps: - df['enzyme'] = enzyme - snp_df.append(df) - if len(snp_df) == 0: - continue - snp_df = pd.concat(snp_df) - logger.verbose = False - gene_df, snp_df = filter_snp_fragments( - snp_df, logger) - snp_df.sort_values(by=['variant_id'], inplace=True) - snp_list = snp_df['variant_id'].drop_duplicates().tolist() - batchsize = 2000 - snp_batches = [snp_list[i:i + batchsize] - for i in range(0, len(snp_list), batchsize)] - chrom_eqtl_df = [] - for batch_num, snp_batch in enumerate(snp_batches): - if len(snp_batches) > 1: - logger.verbose = True - logger.write(' Mapping eQTLs batch {} of {}'.format( - batch_num+1, len(snp_batches))) - logger.verbose = False - batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] - eqtl_df = eqtls.map_eqtls( - batch_gene_df, - tissues, - output_dir, - C, - genotypes_fp, - num_processes, - eqtl_project_db, - covariates_dir, - expression_dir, - pval_threshold, - maf_threshold, - fdr_threshold, - logger) - if eqtl_df is None: + else: + for chrom in sorted(chrom_list): + chrom_dir = os.path.join(output_dir, chrom) + #if os.path.exists(os.path.join(chrom_dir, 'eqtls.txt')): + # logger.write(' Warning: {} already exists. Skipping.'.format( + # os.path.join(chrom_dir, 'eqtls.txt'))) + # continue + logger.write(' Chromosome {}'.format(chrom)) + snp_cols = ['snp', 'variant_id', 'chr', + 'locus', 'id', 'fragment', 'enzyme'] + chrom_df = inter_df[inter_df['fragment_chr'] == chrom] + chrom_df = chrom_df.astype({'fragment': int, + 'fragment_chr': object}) + enzymes = chrom_df['enzyme'].drop_duplicates().tolist() + snp_df = [] + for enzyme in enzymes: + enzyme_df = chrom_df[chrom_df['enzyme'] == enzyme] + enzyme_df = enzyme_df.merge( + gene_info_df, how='inner', + left_on=['query_chr', 'query_fragment', 'enzyme'], + right_on=['chrom', 'gene_fragment', 'enzyme']) + fragment_df = enzyme_df[ + ['gencode_id', 'fragment_chr', 'fragment']].drop_duplicates() + enzyme_df = enzyme_df.sort_values(by=['fragment']) + chunksize = 20000 + enzyme_chunks = [enzyme_df[i:i+chunksize] + for i in range(0, enzyme_df.shape[0], chunksize)] + manager = multiprocessing.Manager() + snps = manager.list() + desc = ' * Hi-C libraries restricted with {}'.format( + enzyme) + bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' + ''' + for df in tqdm.tqdm(enzyme_chunks, desc=desc, unit='batches', + ncols=80, bar_format=bar_format): + find_gene_snps( + df, + enzyme, + snps) + ''' + with multiprocessing.Pool(processes=4) as pool: + for _ in tqdm.tqdm( + pool.istarmap( + find_gene_snps, + zip(enzyme_chunks, + repeat(enzyme), + repeat(snps), + repeat(pchic)) + ), + total=len(enzyme_chunks), desc=desc, unit='batches', + ncols=80, bar_format=bar_format): + pass + + for df in snps: + df['enzyme'] = enzyme + snp_df.append(df) + if len(snp_df) == 0: continue - chrom_eqtl_df.append(eqtl_df) - if len(chrom_eqtl_df) > 0: - chrom_eqtl_df = pd.concat(chrom_eqtl_df) - else: - chrom_eqtl_df = pd.DataFrame() - if not suppress_intermediate_files: - os.makedirs(chrom_dir, exist_ok=True) - snp_df.to_csv(os.path.join(chrom_dir, 'snps.txt'), - sep='\t', index=False) - gene_df.to_csv(os.path.join(chrom_dir, 'genes.txt'), - sep='\t', index=False) - 
chrom_eqtl_df.to_csv(os.path.join(chrom_dir, 'eqtls.txt'), - sep='\t', index=False) - all_eqtls.append(chrom_eqtl_df) - all_snps.append(snp_df) - all_genes.append(gene_df) - logger.verbose = True + snp_df = pd.concat(snp_df) + logger.verbose = False + gene_df, snp_df = filter_snp_fragments( + snp_df, logger, pchic) + snp_df.sort_values(by=['variant_id'], inplace=True) + snp_list = snp_df['variant_id'].drop_duplicates().tolist() + batchsize = 2000 + snp_batches = [snp_list[i:i + batchsize] + for i in range(0, len(snp_list), batchsize)] + chrom_eqtl_df = [] + for batch_num, snp_batch in enumerate(snp_batches): + if len(snp_batches) > 1: + logger.verbose = True + logger.write(' Mapping eQTLs batch {} of {}'.format( + batch_num+1, len(snp_batches))) + logger.verbose = False + batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] + eqtl_df = eqtls.map_eqtls( + batch_gene_df, + tissues, + output_dir, + C, + genotypes_fp, + num_processes, + eqtl_project_db, + covariates_dir, + expression_dir, + pval_threshold, + maf_threshold, + fdr_threshold, + logger) + if eqtl_df is None: + continue + chrom_eqtl_df.append(eqtl_df) + if len(chrom_eqtl_df) > 0: + chrom_eqtl_df = pd.concat(chrom_eqtl_df) + else: + chrom_eqtl_df = pd.DataFrame() + if not suppress_intermediate_files: + os.makedirs(chrom_dir, exist_ok=True) + snp_df.to_csv(os.path.join(chrom_dir, 'snps.txt'), + sep='\t', index=False) + gene_df.to_csv(os.path.join(chrom_dir, 'genes.txt'), + sep='\t', index=False) + chrom_eqtl_df.to_csv(os.path.join(chrom_dir, 'eqtls.txt'), + sep='\t', index=False) + all_eqtls.append(chrom_eqtl_df) + all_snps.append(snp_df) + all_genes.append(gene_df) + logger.verbose = True if len(all_eqtls) == 0: snp_df = pd.DataFrame() gene_df = pd.DataFrame() @@ -179,50 +285,87 @@ def find_snps( (time.time() - start_time)/60)) return snp_df, gene_df, eqtl_df - def find_gene_snps( inter_df, enzyme, - snps#, - #db, -): + snps, + pchic=False): db.dispose() eqtl_project_db.dispose() - table = 'variant_lookup_{}' + + if pchic: + table = 'variant_lookup_pchic_{}' + else: + table = 'variant_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - chrom = inter_df['fragment_chr'].drop_duplicates().tolist()[0] - sql = '''SELECT * FROM {} WHERE chrom = '{}' AND frag_id >= {} AND frag_id <= {}''' - df = pd.DataFrame() - con = eqtl_project_db.connect() - res = con.execute( - sql.format( - table, chrom, inter_df['fragment'].min(), inter_df['fragment'].max()) - ).fetchall() - con.close() - if res: - df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) - inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) - df = inter_df.merge( - df, how='inner', left_on=['fragment'], right_on=['frag_id']) - df['id'] = df['id'].astype('Int64') - df = df[df['frag_id'].notnull()] - df = df.drop_duplicates() - snp_df = find_snp_by_id(df, eqtl_project_db) - if snp_df.empty: - return - snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) - snp_df['enzyme'] = enzyme - snp_df = snp_df.rename(columns={'rsid': 'snp'}) - snps.append(snp_df.drop_duplicates()) - -def find_snp_by_id(df, db): + if not pchic: + chrom = inter_df['fragment_chr'].drop_duplicates().tolist()[0] + sql = '''SELECT * FROM {} WHERE chrom = '{}' AND frag_id >= {} AND frag_id <= {}''' + df = pd.DataFrame() + con = eqtl_project_db.connect() + res = con.execute( + sql.format( + table, chrom, inter_df['fragment'].min(), inter_df['fragment'].max()) + ).fetchall() + 
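# the Hi-C branch fetches candidate variants by fragment-id range
+        # within a single chromosome; the PCHi-C branch below matches the
+        # exact fragment ids with an IN (...) or = clause instead
+        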
con.close() + if res: + df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) + inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) + df = inter_df.merge( + df, how='inner', left_on=['fragment'], right_on=['frag_id']) + df['id'] = df['id'].astype('Int64') + df = df[df['frag_id'].notnull()] + df = df.drop_duplicates() + snp_df = find_snp_by_id(df, eqtl_project_db, pchic) + if snp_df.empty: + return + snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) + snp_df['enzyme'] = enzyme + snp_df = snp_df.rename(columns={'rsid': 'snp'}) + snps.append(snp_df.drop_duplicates()) + else: + snp_frag = inter_df['inter_frag'].drop_duplicates().tolist() + if len(snp_frag) > 1: + snp_frag = tuple(snp_frag) + sql = '''SELECT * FROM {} WHERE frag_id IN {}''' + else: + snp_frag = (snp_frag[0]) + sql = '''SELECT * FROM {} WHERE frag_id = {}''' + df = pd.DataFrame() + con = eqtl_project_db.connect() + res = con.execute( + sql.format( + table, snp_frag) + ).fetchall() + con.close() + if res: + df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) + inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) + df = inter_df.merge( + df, how='inner', left_on=['inter_frag'], right_on=['frag_id']) + df['id'] = df['id'].astype('Int64') + df = df[df['frag_id'].notnull()] + df = df.drop_duplicates() + snp_df = find_snp_by_id(df, eqtl_project_db, pchic) + if snp_df.empty: + return + snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) + snp_df['enzyme'] = enzyme + snp_df = snp_df.rename(columns={'rsid': 'snp'}) + snps.append(snp_df.drop_duplicates()) + +def find_snp_by_id(df, db, pchic=False): df = df.sort_values(by=['id']) - chunksize = eqtls.calc_chunksize( - df['id'].tolist(), 2000000) + if not pchic: + chunksize = eqtls.calc_chunksize( + df['id'].tolist(), 2000000) + else: + chunksize = 100000 chunks = [df[i:i+chunksize] for i in range(0, len(df), chunksize)] snp_df = [] sql = '''SELECT rsid, variant_id, chrom, locus, id FROM variants WHERE id >= {} @@ -258,42 +401,67 @@ def find_snp_by_variant_id(df, db): def filter_snp_fragments( snp_df, - logger): + logger, + pchic=False): ''' Filter snp-fragment interactions ''' - logger.write(' * Filtering gene-SNP interactions...') - snp_df['interactions'] = snp_df.groupby( - ['variant_id', 'gene', 'cell_line', 'enzyme'])[ - 'gene_fragment'].transform('count') - snp_df['replicates'] = snp_df.groupby( - ['variant_id', 'gene', 'cell_line', 'enzyme'])[ - 'replicate'].transform('count') - snp_df = snp_df.drop(columns=['replicate', 'gene_fragment']) - snp_df = snp_df.drop_duplicates() - snp_df['sum_interactions'] = snp_df.groupby( - ['variant_id', 'gene'])[ - 'interactions'].transform('sum') - snp_df['sum_replicates'] = snp_df.groupby( - ['variant_id', 'gene'])[ - 'replicates'].transform('sum') - snp_df['sum_cell_lines'] = snp_df.groupby( - ['variant_id', 'gene'])['cell_line'].transform('count') - condition = ( - (snp_df['interactions'] / snp_df['cell_line_replicates'] <= 1) & - (snp_df['sum_replicates'] < 2) & - (snp_df['sum_cell_lines'] < 2)) - gene_df = snp_df[~condition] - snp_df = gene_df[ - ['id', 'snp', 'chrom', 'locus', 'variant_id', 'fragment', 'enzyme'] - ] - snp_df.drop_duplicates(inplace=True) - cols = ['snp', 'chrom', 'locus', 'variant_id', + if pchic: + ''' Calculating N_reads and chicago_scores ''' + logger.write(' * Calculating cumulative scores for gene promoter-SNP interactions...') + snp_df = snp_df.drop_duplicates() + snp_df = snp_df.drop(columns=['replicate','id']).drop_duplicates() + snp_df['N_reads'] = snp_df.groupby( + ['variant_id', 
'gencode_id', 'inter_frag', + 'gene_fragment', 'cell_line'])[ + 'n_reads'].transform('sum') + snp_df['Score'] = snp_df.groupby(['variant_id', 'gencode_id', 'inter_frag', + 'gene_fragment', 'cell_line','N_reads'])[ + 'score'].transform('mean').round(2) + gene_df = snp_df[['snp', 'chrom', 'locus', 'variant_id', 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', - 'interactions', 'replicates', 'enzyme', - 'cell_line', 'cell_line_replicates', 'sum_interactions', - 'sum_replicates', 'sum_cell_lines'] + 'enzyme', 'cell_line', 'N_reads', 'Score']].drop_duplicates() + snp_df = snp_df[ + ['snp', 'chrom', 'locus', 'variant_id','frag_id','enzyme']] + snp_df.drop_duplicates(inplace=True) + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'enzyme', 'cell_line', 'N_reads', 'Score'] + else: + logger.write(' * Filtering gene-SNP interactions...') + snp_df['interactions'] = snp_df.groupby( + ['variant_id', 'gene', 'cell_line', 'enzyme'])[ + 'gene_fragment'].transform('count') + snp_df['replicates'] = snp_df.groupby( + ['variant_id', 'gene', 'cell_line', 'enzyme'])[ + 'replicate'].transform('count') + snp_df = snp_df.drop(columns=['replicate', 'gene_fragment']) + snp_df = snp_df.drop_duplicates() + snp_df['sum_interactions'] = snp_df.groupby( + ['variant_id', 'gene'])[ + 'interactions'].transform('sum') + snp_df['sum_replicates'] = snp_df.groupby( + ['variant_id', 'gene'])[ + 'replicates'].transform('sum') + snp_df['sum_cell_lines'] = snp_df.groupby( + ['variant_id', 'gene'])['cell_line'].transform('count') + condition = ( + (snp_df['interactions'] / snp_df['cell_line_replicates'] <= 1) & + (snp_df['sum_replicates'] < 2) & + (snp_df['sum_cell_lines'] < 2)) + gene_df = snp_df[~condition] + snp_df = gene_df[ + ['id', 'snp', 'chrom', 'locus', 'variant_id', 'fragment', 'enzyme'] + ] + snp_df.drop_duplicates(inplace=True) + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'interactions', 'replicates', 'enzyme', + 'cell_line', 'cell_line_replicates', 'sum_interactions', + 'sum_replicates', 'sum_cell_lines'] return gene_df[cols], snp_df + + def process_rs_df_whole(rs_df, db): rsid_tuple = () if len(rs_df) == 1: @@ -371,14 +539,17 @@ def get_snp_fragments_whole(snp_df, restriction_enzymes, db): return snp_df -def get_snp_fragments(snp_df, restriction_enzymes, db): +def get_snp_fragments(snp_df, restriction_enzymes, db, pchic=False): snp_df = snp_df.sort_values(by=['id']) fragment_df = [] chunksize = 1000 chunks = [snp_df[i:i+chunksize] for i in range(0, snp_df.shape[0], chunksize)] for enzyme in restriction_enzymes: - table = 'variant_lookup_{}' + if pchic: + table = 'variant_lookup_pchic_{}' + else: + table = 'variant_lookup_{}' df = [] if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') @@ -563,12 +734,13 @@ def get_snp(inputs, db, rs_merge_arch_fp, logger, + pchic=False, suppress_intermediate_files=False ): """Retrieve SNP position and restriction fragments. Args: inputs: File(s) (or stdin) containing SNP rsIDs or genomic positions in bed format (chr:start-end)> - restriction_enzymes: a list of restriction enzymes with which query Hi-C libraries are prepared + restriction_enzymes: a list of restriction enzymes with which query Hi-C/PCHi-C libraries are prepared output_dir: User-specified directory for results. Defaults to inputs directory. 
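pchic: if True, SNP fragments are taken from the variant_lookup_pchic_*
            tables instead of the Hi-C variant_lookup_* tables
        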
postgres_url: path to codes3d_common database suppress_intermediate_files: if 'False', snps.txt file is written to output_dir @@ -594,7 +766,7 @@ def get_snp(inputs, logger.write('We could not find your SNPs in our databases.') sys.exit() snp_df[['id', 'locus']] = snp_df[['id', 'locus']].astype(int) - snp_df = get_snp_fragments(snp_df, restriction_enzymes, db) + snp_df = get_snp_fragments(snp_df, restriction_enzymes, db, pchic) snp_df = snp_df.rename(columns={'rsid': 'snp', 'frag_id': 'fragment'}) if not suppress_intermediate_files: if not merged_snps.empty: diff --git a/codes3d/summary.py b/codes3d/summary.py index 74eeda6..03b8d98 100755 --- a/codes3d/summary.py +++ b/codes3d/summary.py @@ -17,7 +17,7 @@ def produce_summary( eqtl_df, snp_df, gene_df, expression_table_fp, fdr_threshold, output_dir, num_processes, - output_format, no_afc, logger): + output_format, no_afc, logger, pchic=False): """Write final results of eQTL-eGene associations Args: @@ -43,46 +43,75 @@ def produce_summary( eqtl_df['sid_chr'] = eqtl_df['sid_chr'].str[3:] gene_df['pid'] = gene_df['gencode_id'].str.split('.', expand=True)[0] cols = [] - - logger.write(" * Computing Hi-C data...") - df = gene_df[['snp', 'gencode_id', 'cell_line', 'interactions', - 'replicates', 'cell_line_replicates']].merge( - eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') - df['hic_score'] = df['interactions'] / df['cell_line_replicates'] - df = df.drop_duplicates() - grouped_df = df.groupby(['snp', 'gencode_id']) - df['cell_line_hic_scores'] = df.apply(lambda row: ', '.join( - grouped_df.get_group((row['snp'], row['gencode_id']))[ - 'hic_score'].round(2).astype(str).values.tolist()), axis=1) - df['hic_score'] = df.apply(lambda row: grouped_df.get_group( - (row['snp'], row['gencode_id']))[ - 'hic_score'].sum().round(2), axis=1) - df['cell_lines'] = df.apply(lambda row: ', '.join( - grouped_df.get_group((row['snp'], row['gencode_id']))[ - 'cell_line'].values.tolist()), axis=1) - eqtl_df = eqtl_df.merge(df[ - ['snp', 'gencode_id', 'cell_lines', 'hic_score', 'cell_line_hic_scores'] - ].drop_duplicates(), on=['snp', 'gencode_id'], how='left') - + if pchic: + df = gene_df[['snp', 'gencode_id', 'cell_line','N_reads','Score']].merge( + eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') + df = df.drop_duplicates() + grouped_df = df.groupby(['snp', 'gencode_id']) + df['cell_lines'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'cell_line'].values.tolist()), axis=1) + df['chicago_scores'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'Score'].astype(str).values.tolist()), axis=1) + df['N_reads_per_cell_line'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'N_reads'].astype(str).values.tolist()), axis=1) + eqtl_df = eqtl_df.merge(df[ + ['snp', 'gencode_id', 'cell_lines', 'N_reads_per_cell_line', 'chicago_scores'] + ].drop_duplicates(), on=['snp', 'gencode_id'], how='left') + + + else: + logger.write(" * Computing Hi-C data...") + df = gene_df[['snp', 'gencode_id', 'cell_line', 'interactions', + 'replicates', 'cell_line_replicates']].merge( + eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') + df['hic_score'] = df['interactions'] / df['cell_line_replicates'] + df = df.drop_duplicates() + grouped_df = df.groupby(['snp', 'gencode_id']) + df['cell_line_hic_scores'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 
diff --git a/data_preparation/create_index_pchic_sqlite3_db.py b/data_preparation/create_index_pchic_sqlite3_db.py
new file mode 100644
index 0000000..3478cdc
--- /dev/null
+++ b/data_preparation/create_index_pchic_sqlite3_db.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import sqlite3
+import sys
+import os
+
+
+def create_index_db(db_path):
+
+    for path in db_path:
+        # one SQLite database per PCHi-C replicate; index each in turn
+        with sqlite3.connect(path) as conn:
+            cursor = conn.cursor()
+            print("Creating index on tables in database...")
+            cursor.execute('''CREATE INDEX idx_pfid_oefid ON
+                           interactions (p_fid, oe_fid)''')
+
+    return
+
+db_fp = '/mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/'
+
+db_path = []
+for cell_lines in os.listdir(db_fp):
+    cell_lines_fp = os.path.join(db_fp, cell_lines)
+    # each cell line directory holds one .db file per replicate
+    for dbs in os.listdir(cell_lines_fp):
+        dbs_fp = os.path.join(cell_lines_fp, dbs)
+        db_path.append(dbs_fp)
+
+create_index_db(db_path)
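
The composite index created above serves bait-side lookups against the
per-replicate interaction databases. A sketch of the kind of query it
accelerates; the database path and fragment ID are illustrative, while the
table and column names come from the scripts in this patch:

    import sqlite3

    conn = sqlite3.connect(
        '/mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/CD4_T_cells/rep1.db')
    cursor = conn.cursor()
    # the (p_fid, oe_fid) index narrows the scan to matching bait fragments
    cursor.execute(
        'SELECT p_fid, oe_fid, n_reads, score FROM interactions WHERE p_fid = ?',
        (367969,))
    for row in cursor.fetchall():
        print(row)
    conn.close()
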
diff --git a/data_preparation/init_pchic_gene_tables.py b/data_preparation/init_pchic_gene_tables.py
new file mode 100644
index 0000000..4fa43d8
--- /dev/null
+++ b/data_preparation/init_pchic_gene_tables.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import os
+import sys
+import sqlite3
+import argparse
+from sqlalchemy import create_engine
+import csv
+
+
+def read_promoter_file(promoters):
+    col = ['frag_chr', 'frag_start', 'frag_end', 'frag_id', 'gene', 'project']
+    gene_prom = pd.read_csv(promoters, sep = "\t", names = col,
+                            usecols = ['frag_id', 'gene', 'project'])
+    return gene_prom
+
+def read_genebed(genebed):
+    cols = ['chr', 'start', 'end', 'gene', 'gencode_id']
+    gene_bed = pd.read_csv(genebed, sep = "\t", names = cols)
+    return gene_bed
+
+def build_gene_table(gene_prom, gene_bed, table, db_url, outfile):
+    merge_tables = pd.merge(gene_prom, gene_bed, how='inner', on='gene', sort=False)
+    gene_file = merge_tables[['chr', 'start', 'end', 'gene', 'gencode_id',
+                              'frag_id', 'project']].drop_duplicates()
+    db = create_engine(db_url, echo=False)
+    gene_file.to_csv(outfile, sep='\t', header=True, index=False)
+    gene_file.to_sql(table, con=db, if_exists='replace', index=False)
+    print("Creating index...")
+    db.execute("CREATE INDEX idx_{}_frag_id ON {}(frag_id)".format(table, table))
+    db.execute('''CREATE INDEX idx_{}_fragid_gencodeid on
+               {}(frag_id,gencode_id)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_fragid_gencodeid_project on
+               {}(frag_id,gencode_id,project)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_gencodeid_project on
+               {}(gencode_id,project)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_gencode_id on
+               {}(gencode_id)'''.format(table, table))
+    print("Done.")
+    return
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description =
+        'Build gene tables for promoter capture Hi-C data analysis')
+    parser.add_argument('-p', dest = 'pro_file', nargs='+', required = True,
+        help = '''Provide a path to the promoter fragment file (no header).
+        Expected columns in the file: frag_chr, frag_start, frag_end, frag_id,
+        gene_name, project. Multiple files should be separated by a space.''')
+    parser.add_argument('-g', dest = 'gene_bed', required = True,
+        help='''Path to the gene bed file with extension .bed.
+        e.g. /mnt/projects/codes3d/lib/reference_files/genes/gene_reference.bed''')
+    parser.add_argument("-t", dest = 'table', required = True,
+        help='Name of table e.g. gene_lookup_pchic_hindiii.')
+    parser.add_argument("-u", dest = 'db_url', required = True,
+        help="URL of database e.g. postgresql://user:password@hostname/database")
+    parser.add_argument("-o", dest = 'outfile', required = True,
+        help="Absolute path to the output file.")
+    args = parser.parse_args()
+
+    promoters = []
+    if len(args.pro_file) > 1:
+        for pfile in args.pro_file:
+            p_df = read_promoter_file(pfile)
+            promoters.append(p_df)
+        promoters = pd.concat(promoters)
+    else:
+        promoters = read_promoter_file(args.pro_file[0])
+
+    genebed = read_genebed(args.gene_bed)
+
+    build_gene_table(promoters,
+                     genebed,
+                     args.table,
+                     args.db_url,
+                     args.outfile)
+
+
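
Once built, the gene table maps a promoter (bait) fragment ID to its annotated
genes. A sketch of the intended lookup, reusing the example table name and
database URL from the help text above; the fragment ID is illustrative:

    import pandas as pd
    from sqlalchemy import create_engine

    db = create_engine('postgresql://user:password@hostname/database', echo=False)
    sql = '''SELECT gene, gencode_id, project
             FROM gene_lookup_pchic_hindiii
             WHERE frag_id = 218'''
    # one row per gene annotated to the bait fragment
    print(pd.read_sql(sql, con=db))
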
diff --git a/data_preparation/init_pchic_meta.py b/data_preparation/init_pchic_meta.py
new file mode 100755
index 0000000..76759bc
--- /dev/null
+++ b/data_preparation/init_pchic_meta.py
@@ -0,0 +1,31 @@
+#! /usr/bin/env python
+
+import os
+import pandas as pd
+import sys
+from sqlalchemy import create_engine
+
+if __name__ == '__main__':
+    '''
+    Create table for meta info on PCHi-C libraries.
+    To be run after placing PCHi-C libraries in
+    '../lib/pchic/<enzyme>/pchic_data/<library>/<replicate>.db'
+    '''
+    pchic_fp = '/mnt/projects/codes3d/lib/pchic'  # site-specific library root
+    enzyme_dict = []
+    for enzyme in os.listdir(pchic_fp):
+        enzyme_fp = os.path.join(pchic_fp, enzyme)
+        for library in os.listdir(os.path.join(enzyme_fp, 'pchic_data')):
+            library_fp = os.path.join(enzyme_fp, 'pchic_data', library)
+            # each library directory holds one <replicate>.db per replicate;
+            # the replicate name is the file name without its extension
+            for rep in os.listdir(library_fp):
+                enzyme_dict.append((rep.split('.')[0], enzyme, library))
+    replicates = pd.DataFrame(enzyme_dict, columns=['replicate', 'enzyme', 'library'])
+    libraries = replicates.groupby(
+        ['library', 'enzyme']).size().reset_index(name='rep_count')
+    from_file = pd.read_csv('/mnt/projects/users/sgok603/codes3d/meta_info/meta_pchic.txt', sep="\t")
+    libraries = libraries.merge(from_file, how='left', on=['library', 'enzyme']).drop_duplicates()
+    db = create_engine(
+        'postgresql://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False)
+    libraries.drop_duplicates().to_sql('meta_pchic', con=db, if_exists='replace')
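
The directory layout init_pchic_meta.py walks is therefore as follows; the
library and replicate names are illustrative:

    lib/pchic/
    └── HindIII/
        └── pchic_data/
            ├── CD4_T_cells/
            │   ├── rep1.db
            │   └── rep2.db
            └── monocytes/
                └── rep1.db
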
diff --git a/data_preparation/init_pchic_sqlite_db.py b/data_preparation/init_pchic_sqlite_db.py
new file mode 100644
index 0000000..b48bc65
--- /dev/null
+++ b/data_preparation/init_pchic_sqlite_db.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import sqlite3
+from sqlite3 import Error
+import os
+import sys
+import argparse
+import time
+
+def read_digested_genome(genome):
+
+    cols = ['fchr', 'fstart', 'fend', 'fid']
+    if genome.endswith('.rmap'):
+        genome_df = pd.read_csv(genome, sep = "\t", header = None, names=cols)
+        # one row per restriction fragment: chrom, start, end, fragment ID
+        return genome_df
+
+
+def read_chicago_output_ibeds(inputs):
+
+    inter_cols = ['b_chr', 'b_start', 'b_end', 'b_name', 'oe_chr', 'oe_start', 'oe_end', 'oe_name', 'n_reads', 'score']
+    input_df = pd.read_csv(inputs, sep = "\t", skiprows=1, names=inter_cols)
+    return input_df
+
+
+def build_pchic_db(inputs, genome, tissue_dirs, reps_dirs, out_dir, db_tblname):
+
+    genome_df = read_digested_genome(genome)
+    input_df = read_chicago_output_ibeds(inputs)
+
+    bait_df = pd.merge(input_df, genome_df, how='inner', left_on=['b_chr', 'b_start', 'b_end'],
+                       right_on=['fchr', 'fstart', 'fend'], sort = False)
+    bait_short_df = bait_df[['b_chr', 'b_start', 'b_end', 'b_name', 'fid']]
+    bait_short_df.columns = ['p_chr', 'p_start', 'p_end', 'p_name', 'p_fid']
+
+    oe_df = pd.merge(input_df, genome_df, how='inner', left_on=['oe_chr', 'oe_start', 'oe_end'],
+                     right_on=['fchr', 'fstart', 'fend'], sort = False)
+    oe_short_df = oe_df[['oe_chr', 'oe_start', 'oe_end', 'oe_name', 'fid', 'n_reads', 'score']]
+    oe_short_df.columns = ['oe_chr', 'oe_start', 'oe_end', 'oe_name', 'oe_fid', 'n_reads', 'score']
+
+    bait_oe_df = pd.concat([bait_short_df, oe_short_df], axis=1, sort=False)  # assumes both merges matched every interaction, so rows stay aligned
+
+    tissue_dir_basename = os.path.splitext(os.path.basename(tissue_dirs))[0]
+    rep_dir_basename = os.path.splitext(os.path.basename(reps_dirs))[0]
+    db_file = rep_dir_basename + '.db'
+    out_db_fp = os.path.join(out_dir, tissue_dir_basename)
+
+    if not os.path.exists(out_db_fp):
+        os.makedirs(out_db_fp, exist_ok=True)
+        os.chmod(out_db_fp, 0o777)
+
+    db_fp = os.path.join(out_db_fp, db_file)
+    # the sqlite3 context manager commits the transaction on success
+    with sqlite3.connect(db_fp) as conn:
+        cursor = conn.cursor()
+        print("Creating database {}".format(db_file))
+        bait_oe_df.to_sql(db_tblname, if_exists='replace', con=conn, index=False)
+        print("Creating index on tables in database {}...".format(db_file))
+        cursor.execute('''CREATE INDEX idx_{}_baits
+                       ON {} (p_fid)'''.format(rep_dir_basename, db_tblname))
+        cursor.execute('''CREATE INDEX idx_{}_oes
+                       ON {} (oe_fid)'''.format(rep_dir_basename, db_tblname))
+        cursor.execute('''CREATE INDEX idx_{}_pfid_oefid
+                       ON {} (p_fid,oe_fid)'''.format(rep_dir_basename, db_tblname))
+        print("Done!")
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(
+        description = "Build SQLite libraries of PCHi-C data for CoDeS3D.")
+
+    parser.add_argument(
+        '-r', '--rmap-file', required = True,
+        help = '''Bowtie-digested hg38 genome file with the extension .rmap
+        that was used for the CHiCAGO run.''')
+
+    parser.add_argument(
+        '-d', '--pchic-dir', required = True,
+        help = '''Parent directory containing tissue subdirectories, which in
+        turn contain replicate subdirectories (e.g. parent/tissue/replicate_{1..n}/)
+        where the CHiCAGO output .ibed files reside.''')
+
+    parser.add_argument(
+        '-o', '--output_dir', required = True,
+        help = '''Path to the output directory in which to save the PCHi-C
+        databases. The directory is created automatically if it does not
+        already exist.''')
+
+    return parser.parse_args()
+
+if __name__ == '__main__':
+
+    args = parse_args()
+    start_time = time.time()
+
+    if not (args.rmap_file and args.pchic_dir and args.output_dir):
+        message = '''Missing one or more of --rmap-file, --pchic-dir and --output_dir.
+        See init_pchic_sqlite_db.py -h for more details.'''
+        print(message)
+        sys.exit()
+
+    if args.rmap_file is None:
+        message = '''Missing parameter: a genome fragment file with the extension .rmap is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+    else:
+        print("\nUsing rmap file in: '%s'\n" %args.rmap_file)
+
+    if args.pchic_dir is None:
+        message = '''Missing parameter: a directory containing .ibed files from CHiCAGO is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+
+    if args.output_dir is None:
+
+        message = '''Missing parameter: an output directory to save the databases is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+
+    # resolve and, if necessary, create the output directory; note that
+    # os.makedirs returns None, so its result is never assigned
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir, exist_ok = True)
+    out_dir = args.output_dir
+    print("PCHi-C databases will be stored in the output directory: '%s'\n" %out_dir)
+
+    db_tblname = 'interactions'
+    for tissues in os.listdir(args.pchic_dir):
+        # first level: one subdirectory per tissue or cell line
+        tissue_dirs = os.path.join(args.pchic_dir, tissues)
+        # second level: one subdirectory per replicate
+        for reps in os.listdir(tissue_dirs):
+            # each replicate directory holds the CHiCAGO outputs for one run
+            reps_dirs = os.path.join(tissue_dirs, reps)
+            # build one SQLite database per replicate from its .ibed file
+            for files in os.listdir(reps_dirs):
+                files_path = os.path.join(reps_dirs, files)
+                # skip anything that is not a CHiCAGO .ibed output
+                if not files_path.endswith('.ibed'):
+                    continue
+                else:
+                    input_fp = files_path  # already a full path; joining again would corrupt it
+                    build_pchic_db(input_fp,
+                                   args.rmap_file,
+                                   tissue_dirs,
+                                   reps_dirs,
+                                   out_dir,
+                                   db_tblname)
+    print("All done.")
+    print('Total time elapsed: {:.2f} mins'.format((time.time()-start_time)/60))
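
A typical invocation of the script above, with purely illustrative paths (the
-r, -d and -o options are those defined in parse_args()):

    python init_pchic_sqlite_db.py \
        -r hg38_hindiii_fragments.rmap \
        -d /path/to/chicago_output/ \
        -o /mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/
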
diff --git a/data_preparation/init_variant_fragment_table_pchic.py b/data_preparation/init_variant_fragment_table_pchic.py
new file mode 100755
index 0000000..a9800f1
--- /dev/null
+++ b/data_preparation/init_variant_fragment_table_pchic.py
@@ -0,0 +1,98 @@
+import pybedtools
+import pandas as pd
+import sys
+from tqdm import tqdm
+import argparse
+import os
+import time
+from sqlalchemy import create_engine
+
+
+def find_variant_fragments(query_fp, fragment_fp, output_fp, db_url, table):
+    """ Create table of variant fragment IDs. """
+    fragment_bed = pybedtools.BedTool(fragment_fp)
+    desc = 'Finding variant fragments...'
+    bar_format = '{desc}: {n_fmt} {unit}'
+    t = tqdm(total=0, unit='variants', desc=desc, disable=False, bar_format=bar_format)
+    chunksize = 200000
+    cols = ['chr', 'frag_id', 'id', 'variant_id']  # overwritten per chunk below
+    pybedtools.set_tempdir(os.path.dirname(output_fp))
+    db = create_engine(db_url, echo=False)
+    idx = 1
+    for query_df in pd.read_csv(
+            query_fp, sep='\t', compression='gzip', chunksize=chunksize,
+            usecols=['chr', 'variant_pos', 'variant_id']):
+        query_df = query_df.rename(columns={'variant_pos': 'end'})
+        query_df['id'] = range(idx, idx+len(query_df))
+        query_df['start'] = query_df['end'] - 1
+        query_bed = pybedtools.BedTool.from_dataframe(
+            query_df[['chr', 'start', 'end', 'id', 'variant_id']])
+        df = fragment_bed.intersect(query_bed, wb=True)
+        df = df.to_dataframe(
+            names=['frag_chr', 'frag_start', 'frag_end', 'frag_id',
+                   'chrom', 'start', 'end', 'id', 'variant_id'])
+        cols = ['frag_id', 'chrom', 'id']
+        mode = 'w' if t.total == 0 else 'a'
+        header = t.total == 0  # write the header only for the first chunk
+        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode)
+        if table:
+            if_exists = 'replace' if t.total == 0 else 'append'
+            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
+        idx += len(query_df)
+        t.total += len(query_df)
+        t.update(len(query_df))
+    t.close()
+    pybedtools.cleanup(remove_all=True)
+    if not table:
+        return
+    create_index(table, db)
+
+def create_index(table, db):
+    print('Creating index for {}...'.format(table))
+    db.execute(
+        '''CREATE INDEX idx_{}_frag_chrom ON {}(frag_id, chrom)
+        '''.format(table, table))
+    db.execute(
+        '''CREATE INDEX idx_{}_id ON {}(id)
+        '''.format(table, table))
+    db.execute(
+        '''CREATE INDEX idx_{}_fragid ON {}(frag_id)
+        '''.format(table, table))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Create a PostgreSQL table of variant or gene fragment IDs.")
+    parser.add_argument(
+        '-f', '--fragments', required=True,
+        help='The filepath to the fragment BED file')
+    parser.add_argument(
+        '-v', '--variants',
+        help=('The filepath to the variant BED file. Must contain chr, '
+              'variant_pos, and variant_id columns'))
+    parser.add_argument(
+        "-t", "--table",
+        help='Name of table e.g. variant_lookup_pchic_hindiii.')
+    parser.add_argument(
+        "-u", "--db-url", required=False,
+        help='URL of database e.g. postgresql://user:password@hostname/database')
+    parser.add_argument(
+        "-c", "--config",
+        default=os.path.join(os.path.dirname(__file__),
+                             "../docs/codes3d.conf"),
+        help="The configuration file to be used for this " +
+        "run (default: codes3d.conf)")
+    parser.add_argument(
+        '-o', '--output', required=True,
+        help='The filepath to the output file.')
+    args = parser.parse_args()
+    start_time = time.time()
+    if not args.variants:
+        sys.exit('Variants or gene bed file missing.')
+    if os.path.exists(args.output):
+        os.remove(args.output)
+    if args.variants:
+        find_variant_fragments(
+            args.variants, args.fragments, args.output,
+            args.db_url, args.table)
+    print('Time elapsed: {:.2f} mins'.format(
+        (time.time() - start_time)/60))
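
A sketch of how this script is run; the file names are illustrative, and note
that the variant file is read with compression='gzip', so it must be gzipped:

    python init_variant_fragment_table_pchic.py \
        -f hg38_hindiii_fragments.bed \
        -v variant_positions.txt.gz \
        -t variant_lookup_pchic_hindiii \
        -u postgresql://user:password@hostname/database \
        -o variant_fragments.txt
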
diff --git a/data_preparation/tidy_promoter_baitmap_file.py b/data_preparation/tidy_promoter_baitmap_file.py
new file mode 100644
index 0000000..3a85a0c
--- /dev/null
+++ b/data_preparation/tidy_promoter_baitmap_file.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import argparse
+import time
+import os
+import sys
+
+
+def tidy_promoter_file(baits, out_file, p_id):
+    cols = ['chr', 'start', 'end', 'frag_id', 'genes']
+    bait_df = pd.read_csv(baits, sep="\t", names = cols).drop_duplicates()
+    bait_df['genes'] = bait_df['genes'].str.replace(', ', ',')
+    bait_df['genes'] = bait_df['genes'].str.split(',')
+    bait_df_new = bait_df.explode('genes')
+    bait_df_new = bait_df_new.dropna()
+    bait_df_new = bait_df_new.drop_duplicates()
+    bait_df_new['project'] = p_id
+    bait_df_new.sort_values(by = 'genes', inplace=True, ascending=True)
+    bait_df_new.to_csv(out_file, sep='\t', header=False, index=False)
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Tidy up promoter file for CoDeS3D pipeline")
+
+    parser.add_argument(
+        '-f', '--bait-file', required = True,
+        help = '''Pass a promoter information file with the extension .baitmap
+        that was used for the CHiCAGO analysis''')
+
+    parser.add_argument(
+        '-o', '--out-fp', required = True,
+        help = '''Provide a path to an output file''')
+
+    parser.add_argument(
+        '-p', '--project-id', type=str, required = True,
+        help = '''Project ID to be appended to the output file,
+        which will be used while running the CoDeS3D pipeline.
+        Example: 'InkyungJung2019'. ''')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+
+    args = parse_args()
+    if not (args.bait_file and args.out_fp and args.project_id):
+        message = '''One or more of the required parameters are missing.
        Use tidy_promoter_baitmap_file.py -h for more details.'''
+        print(message)
+        sys.exit()
+
+    tidy_promoter_file(args.bait_file,
+                       args.out_fp,
+                       args.project_id)
+
+
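
The tidying step above turns one bait row carrying a comma-separated gene list
into one row per (fragment, gene) pair. A minimal sketch with made-up values:

    import pandas as pd

    bait_df = pd.DataFrame({
        'chr': ['chr1'], 'start': [850619], 'end': [874081],
        'frag_id': [218], 'genes': ['SAMD11,NOC2L']})
    bait_df['genes'] = bait_df['genes'].str.split(',')
    # one row per gene, with the fragment coordinates repeated
    print(bait_df.explode('genes'))
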
diff --git a/docs/codes3d.conf b/docs/codes3d.conf
index ada7dc8..28e0085 100755
--- a/docs/codes3d.conf
+++ b/docs/codes3d.conf
@@ -12,9 +12,9 @@
 SNP_BED_DIR: %(SNP_REF_DIR)shuman_9606_b151_GRCh38p7
 RS_MERGE_ARCH: %(SNP_REF_DIR)shuman_9606_b151_GRCh38p7_RsMergeArch.pairs.bcp.gz
 HIC_DIR: %(libdir)shic
+PCHIC_DIR: %(libdir)spchic
 EQTL_DATA_DIR: %(libdir)seqtls
-
 [GTEx]
 eqtl_dir=../lib/eqtls/GTEx/
 GENE_FP: %(eqtl_dir)sGTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz

From cbdd3ebdd05b40445f9edac1e23a92df62b297fe Mon Sep 17 00:00:00 2001
From: sreemol-gokuladhas
Date: Wed, 22 Jun 2022 09:26:20 +1200
Subject: [PATCH 2/2] Added a flag to list available PCHi-C libraries

---
 codes3d/codes3d.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/codes3d/codes3d.py b/codes3d/codes3d.py
index d8965fd..e2ce4b7 100755
--- a/codes3d/codes3d.py
+++ b/codes3d/codes3d.py
@@ -230,6 +230,14 @@ def list_hic_libraries(db):
         print('{}\t{}'.format(idx + 1, row['library']))
     db.dispose()

+def list_pchic_libraries(db):
+    sql = '''SELECT library, tissue FROM meta_pchic'''
+    with db.connect() as con:
+        df = pd.read_sql(sql, con=con)
+    for idx, row in df.drop_duplicates().iterrows():
+        print('{}\t{}'.format(idx + 1, row['library']))
+    db.dispose()
+
 def list_enzymes(db):
     sql = '''SELECT DISTINCT enzyme FROM meta_hic'''
     with db.connect() as con:
@@ -618,6 +626,9 @@ def parse_args():
     parser.add_argument(
         '--list-hic-libraries', action='store_true', default=False,
         help='List available Hi-C libraries.')
+    parser.add_argument(
+        '--list-pchic-libraries', action='store_true', default=False,
+        help='List available PCHi-C libraries.')
     parser.add_argument(
         '--match-tissues', action='append', nargs=argparse.REMAINDER,
         default=None, help='''Try to match eQTL and Hi-C tissue types using space-separated
@@ -701,6 +712,9 @@ def validate_args(args, commons_db):
     if args.list_hic_libraries:
         list_hic_libraries(commons_db)
         sys.exit()
+    if args.list_pchic_libraries:
+        list_pchic_libraries(commons_db)
+        sys.exit()
     if args.list_enzymes:
         list_enzymes(commons_db)
         sys.exit()
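
With the libraries prepared and registered as above, the new flags are used as
follows (shown without the pipeline's usual SNP-input and output options,
which are unchanged by this patch):

    # list the installed PCHi-C libraries
    python codes3d/codes3d.py --list-pchic-libraries

    # query PCHi-C libraries instead of Hi-C libraries
    python codes3d/codes3d.py --pchic [other options...]
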