From 2359f2e60455e9be76dce7c30fb374bdc1e1527d Mon Sep 17 00:00:00 2001 From: Tayaza Fadason Date: Thu, 17 Feb 2022 15:40:24 +1300 Subject: [PATCH] Update dependencies. --- codes3d/init_eqtl_genotype_db.py | 63 ---------- codes3d/init_eqtl_meta.py | 17 --- codes3d/init_fragment_db.py | 161 ------------------------- codes3d/init_gene_variant_db.py | 118 ------------------ codes3d/init_hic_db_postgres.py | 166 -------------------------- codes3d/init_hic_db_sqlite.py | 197 ------------------------------- codes3d/init_hic_meta.py | 30 ----- docs/requirements.txt | 16 --- environment.yaml | 2 +- 9 files changed, 1 insertion(+), 769 deletions(-) delete mode 100755 codes3d/init_eqtl_genotype_db.py delete mode 100755 codes3d/init_eqtl_meta.py delete mode 100755 codes3d/init_fragment_db.py delete mode 100755 codes3d/init_gene_variant_db.py delete mode 100755 codes3d/init_hic_db_postgres.py delete mode 100755 codes3d/init_hic_db_sqlite.py delete mode 100755 codes3d/init_hic_meta.py delete mode 100755 docs/requirements.txt diff --git a/codes3d/init_eqtl_genotype_db.py b/codes3d/init_eqtl_genotype_db.py deleted file mode 100755 index 0096be3..0000000 --- a/codes3d/init_eqtl_genotype_db.py +++ /dev/null @@ -1,63 +0,0 @@ -#! /usr/bin/env python - -import torch -from tensorqtl import * -from tensorqtl import genotypeio, trans -from tensorqtl import tensorqtl -import argparse -import pandas as pd -from sqlalchemy import create_engine -import time -from tqdm import tqdm -sys.path.insert( - 1, os.path.join(os.path.abspath(os.path.dirname(__file__)), 'tensorqtl')) - -'''Create genotype table for eQTL mapping by tensorqtl -''' - - -def create_genotype_table(genotype_df, db_url): - table = 'genotype' - desc = 'Building genotype table...' - bar_format = '{desc}: {n_fmt} {unit}' - t = tqdm(total=0, unit='variants', desc=desc, disable=False, - bar_format=bar_format) - chunksize = 50000 - chunks = [genotype_df[i:i+chunksize] - for i in range(0, len(genotype_df), chunksize)] - db = create_engine(db_url, echo=False) - for df in chunks: - if_exists = 'replace' if t.total == 0 else 'append' - df.to_sql(table, con=db, if_exists=if_exists) - t.total += len(df) - t.update(len(df)) - t.close() - # db.execute('''CREATE INDEX idx_{}_snp ON {} - # (snp)'''.format(table, table)) - - -def read_genotype(plink_prefix): - ''' Read WGS VCF ''' - pr = genotypeio.PlinkReader(plink_prefix) - genotype_df = pd.DataFrame(pr.load_genotypes(), - #index=pr.bim['snp'], - columns=pr.fam['iid']) - return genotype_df - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='') - parser.add_argument( - '-p', '--plink-prefix', required=True, - help='''Filepath to plink generated files of genotypes but without the - extensions''') - parser.add_argument( - '-u', '--db-url', required=True, - help='URL of database e.g postgresql://user:password@hostname/database') - args = parser.parse_args() - start_time = time.time() - genotype_df = read_genotype(args.plink_prefix) - print(genotype_df) - create_genotype_table(genotype_df, args.db_url) - print('Time elapsed: {:.2f} mins'.format( - (time.time() - start_time)/60)) diff --git a/codes3d/init_eqtl_meta.py b/codes3d/init_eqtl_meta.py deleted file mode 100755 index ee84163..0000000 --- a/codes3d/init_eqtl_meta.py +++ /dev/null @@ -1,17 +0,0 @@ -#! /usr/bin/env python - -import os -import pandas as pd -import sys -#import psycopg2 -from sqlalchemy import create_engine - -""" -Create PostgreSQL table of list of eQTL projects. -Requires a meta_eqtls.txt -""" -eqtl_fp = os.path.join(os.path.dirname(__file__), '../lib/meta_info/meta_eqtls.txt') -eqtls = pd.read_csv(eqtl_fp, sep='\t') -db = create_engine( - 'postgres://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False) -eqtls.to_sql('meta_eqtls', con=db, if_exists='replace', index=False) diff --git a/codes3d/init_fragment_db.py b/codes3d/init_fragment_db.py deleted file mode 100755 index c6caea2..0000000 --- a/codes3d/init_fragment_db.py +++ /dev/null @@ -1,161 +0,0 @@ -import pybedtools -import pandas as pd -import sys -from tqdm import tqdm -import argparse -import os -import time -from sqlalchemy import create_engine -import codes3d - - -def find_variant_fragments( - query_fp, - fragment_fp, - output_fp, - db_url, - table): - """ Create table of variant fragment IDs. - """ - fragment_bed = pybedtools.BedTool(fragment_fp) - desc = 'Finding variant fragments...' - bar_format = '{desc}: {n_fmt} {unit}' - t = tqdm(total=0, unit='variants', desc=desc, disable=False, - bar_format=bar_format) - chunksize = 200000 - cols = ['chr', 'frag_id', 'id', 'variant_id'] - pybedtools.set_tempdir(os.path.dirname(output_fp)) - db = create_engine(db_url, echo=False) - idx = 1 - for query_df in pd.read_csv( - query_fp, sep='\t', compression='gzip', chunksize=chunksize, - usecols=['chr', 'variant_pos', 'variant_id']): - query_df = query_df.rename(columns={'variant_pos': 'end'}) - query_df['id'] = range(idx, idx+len(query_df)) - query_df['start'] = query_df['end'] - 1 - query_bed = pybedtools.BedTool.from_dataframe( - query_df[['chr', 'start', 'end', 'id', 'variant_id']]) - df = fragment_bed.intersect(query_bed, wb=True) - df = df.to_dataframe( - names=['frag_chr', 'frag_start', 'frag_end', 'frag_id', - 'chrom', 'start', 'end', 'id', 'variant_id']) - cols = ['frag_id', 'chrom', 'id'] - mode = 'w' if t.total == 0 else 'a' - header = 'True' if t.total == 0 else None - df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode) - if table: - if_exists = 'replace' if t.total == 0 else 'append' - df[cols].to_sql(table, con=db, if_exists=if_exists, index=False) - idx += len(query_df) - t.total += len(query_df) - t.update(len(query_df)) - t.close() - pybedtools.cleanup(remove_all=True) - if not table: - return - create_index(table, db) - - -def find_gene_fragments( - query_fp, - fragment_fp, - output_fp, - db_url, - table): - """ Create table of gene fragment IDs. - """ - fragment_bed = pybedtools.BedTool(fragment_fp) - desc = 'Finding gene fragments...' - bar_format = '{desc}: {n_fmt} {unit}' - t = tqdm(total=0, unit='genes', desc=desc, disable=False, - bar_format=bar_format) - chunksize = 2000 - pybedtools.set_tempdir(os.path.dirname(output_fp)) - db = create_engine(db_url, echo=False) - for query_df in pd.read_csv( - query_fp, sep='\t', compression='infer', chunksize=chunksize - ): - #query_df.columns = ['id', 'chr', 'start', 'end', 'gene', 'gencode_id'] - query_bed = pybedtools.BedTool.from_dataframe( - query_df[['chrom', 'start', 'end', 'id', 'gencode_id']]) - # print(query_bed) - df = fragment_bed.intersect(query_bed, wb=True) - df = df.to_dataframe( - names=['frag_chr', 'frag_start', 'frag_end', 'frag_id', - 'chrom', 'start', 'end', 'id', 'gencode_id']) - cols = ['frag_id', 'chrom', 'id'] - mode = 'w' if t.total == 0 else 'a' - header = 'True' if t.total == 0 else None - df[cols].to_csv(output_fp, sep='\t', header=header, - mode=mode, index=False) - if table: - if_exists = 'replace' if t.total == 0 else 'append' - df[cols].to_sql(table, con=db, if_exists=if_exists, index=False) - t.total = len(query_df) - t.update(len(query_df)) - t.close() - pybedtools.cleanup(remove_all=True) - if not table: - return - create_index(table, db) - - -def create_index(table, db): - print('Creating index for {}...'.format(table)) - db.execute( - '''CREATE INDEX idx_{}_frag_chrom ON {}(frag_id, chrom) - '''.format(table, table)) - db.execute( - '''CREATE INDEX idx_{}_id ON {}(id) - '''.format(table, table)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Create s PostgreSQL table of variant or gene fragment IDs.") - parser.add_argument( - '-f', '--fragments', required=True, - help='The filepath to the fragment BED file with chrom, start, end, fragment ID.') - parser.add_argument( - '-v', '--variants', - help='''The filepath to the variant lookup file. Must contain chr, - variant_pos, and variant_id columns''') - parser.add_argument( - '-g', '--genes', - help='''The filepath to a sorted gene reference file containing - id, chrom, start, end, name, and gencode_id columns. - Note that the id indicates the sorting (chrom, start) - order of the file starting at 1.''') - parser.add_argument( - '-t', '--table', - help='Name of table e.g. variant_lookup_mboi.') - parser.add_argument( - '-u', '--db-url', required=True, - help='URL of database e.g postgresql://user:password@hostname/database') - parser.add_argument( - '-c', '--config', - default=os.path.join(os.path.dirname(__file__), - '../docs/codes3d.conf'), - help='''The configuration file to be used for this - run (default: codes3d.conf)''') - - parser.add_argument( - '-o', '--output', required=True, - help='The filepath to output file.') - args = parser.parse_args() - c = codes3d.CODES3D(args.config) - start_time = time.time() - if not args.variants and not args.genes: - sys.exit('Variants or gene bed file missing.') - if os.path.exists(args.output): - os.remove(args.output) - if args.variants: - find_variant_fragments( - args.variants, args.fragments, args.output, - args.db_url, args.table) - elif args.genes: - find_gene_fragments( - args.genes, args.fragments, args.output, - args.db_url, args.table) - print('Time elapsed: {:.2f} mins'.format( - (time.time() - start_time)/60)) diff --git a/codes3d/init_gene_variant_db.py b/codes3d/init_gene_variant_db.py deleted file mode 100755 index 610aa5c..0000000 --- a/codes3d/init_gene_variant_db.py +++ /dev/null @@ -1,118 +0,0 @@ -import pandas as pd -import sys -from tqdm import tqdm -import argparse -import os -import time -from sqlalchemy import create_engine -import codes3d - - -def build_variants_table( - variant_fp, - db_url, - table): - """Create a table of genotype. - genotype(snp, sample_id..) - Indexes: - "ix_genotype_snp" btree (snp) - """ - desc = 'Building variants table...' - bar_format = '{desc}: {n_fmt} {unit}' - t = tqdm(total=0, unit='variants', desc=desc, disable=False, - bar_format=bar_format) - chunksize = 200000 - cols = ['chr', 'frag_id', 'id', 'variant_id'] - db = create_engine(db_url, echo=False) - idx = 1 - for df in pd.read_csv( - variant_fp, sep='\t', compression='gzip', chunksize=chunksize): - df = df.rename(columns={ - 'chr': 'chrom', - 'variant_pos': 'locus', - 'rs_id_dbSNP151_GRCh38p7': 'rsid' - } - ) - df['id'] = range(idx, idx+len(df)) - cols = ['id', 'rsid', 'chrom', 'locus', 'variant_id', 'ref', 'alt', 'num_alt_per_site'] - if_exists = 'replace' if t.total == 0 else 'append' - df[cols].to_sql(table, con=db, if_exists=if_exists, index=False) - idx += len(df) - t.total += len(df) - t.update(len(df)) - t.close() - db.execute('''CREATE INDEX idx_{}_rsid_chrom_locus ON {} - (rsid, chrom, locus)'''.format( - '{}_{}'.format(table, 'rsid_chrom_locus'), table)) - db.execute('''CREATE INDEX idx_{}_rsid_chrom_locus ON {} - (rsid, chrom, locus)'''.format( - '{}_{}'.format(table, 'id'), table)) - - -def build_gene_table( - gene_fp, - db_url, - table): - """Create a table of gene info. - gene(id, chrom, start, end, name, gencode_id) - Indexes: - "genes_pk" PRIMARY KEY, btree (id) - "genes_name_id" btree (id) - "genes_name_index" btree (name) - """ - desc = 'Building gene table...' - bar_format = '{desc}: {n_fmt} {unit}' - t = tqdm(total=0, unit='genes', desc=desc, disable=False, - bar_format=bar_format) - chunksize = 2000 - db = create_engine(db_url, echo=False) - for df in pd.read_csv( - query_fp, sep='\t', compression='infer', chunksize=chunksize - ): - cols = ['frag_id', 'chrom', 'id'] - if_exists = 'replace' if t.total == 0 else 'append' - df[cols].to_sql(table, con=db, if_exists=if_exists, index=False) - t.total = len(df) - t.update(len(df)) - t.close() - create_index(table, db) - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Build database tables gene and variant fragments') - parser.add_argument( - '-v', '--variants', - help='The filepath to the variant lookup file') - parser.add_argument( - '-g', '--genes', - help='''The filepath to a sorted gene reference file containing - 'id, chrom, start, end, name, and gencode_id columns'. - Note that the id indicates the sorting (chrom, start) - order of the file starting at 1.''') - parser.add_argument( - '-t', '--table', - help='Name of table e.g. variant_lookup_mboi.') - parser.add_argument( - '-u', '--db-url', required=True, - help='''URL of database e.g postgresql://user:password@hostname/database - This is usually codes3d_commons''') - parser.add_argument( - '-c', '--config', - default=os.path.join(os.path.dirname(__file__), - '../docs/codes3d.conf'), - help='''The configuration file to be used for this - run (default: codes3d.conf)''') - - args = parser.parse_args() - c = codes3d.CODES3D(args.config) - start_time = time.time() - if not args.variants and not args.genes: - sys.exit('Variants or gene bed file missing.') - if args.variants: - build_variants_table(args.variants, args.db_url, args.table) - elif args.genes: - build_gene_table(args.genes, args.db_url, args.table) - print('Time elapsed: {:.2f} mins'.format( - (time.time() - start_time)/60)) diff --git a/codes3d/init_hic_db_postgres.py b/codes3d/init_hic_db_postgres.py deleted file mode 100755 index 3c23720..0000000 --- a/codes3d/init_hic_db_postgres.py +++ /dev/null @@ -1,166 +0,0 @@ -#!usr/bin/env python -import os -import pandas as pd -from tqdm import tqdm -import argparse -from sqlalchemy import create_engine -from sqlalchemy_utils import create_database, database_exists -import sys -import codes3d -import time -import gzip -import sqlite3 - -def create_db(db_auth, db_url, tblspace): - if tblspace and db_auth.startswith('postgres'): - db = create_engine(os.path.join(db_auth, 'codes3d'), echo=False) - sql = '''CREATE DATABASE {} TABLESPACE {}''' - conn = db.connect() - conn.execute('commit') - conn.execute(sql.format(db_url, tblspace)) - conn.close() - elif db_auth.startswith('sqlite'): - create_database(os.path.join(db_auth, db_url)) - if database_exists(os.path.join(db_auth, db_url)): - print('{} database created.'.format(db_url)) - else: - print('FATAL: {} database not created.'.format(db_url)) - sys.exit() - - -def build_hic_tables(cell_line, enzyme, fp, db_auth, mapq_cutoff, tblspace): - start_time = time.time() - db_url = 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower()) - #db_url = os.path.join(db_auth, 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower())) - if db_auth.startswith('sqlite'): - db_url = db_url + '.db' - if not database_exists(os.path.join(db_auth, db_url)): - create_db(db_auth, db_url, tblspace) - db = create_engine(os.path.join(db_auth, db_url), echo=False) - rep = os.path.basename(fp).split('.')[0] - rep = rep[:rep.rfind('_merged')].split('_')[-1] - desc = ' * {} replicate {}'.format(cell_line, rep) - bar_format = '' - t = tqdm(total=0, desc=desc, bar_format=bar_format) - chunksize = 200000 - idx = 1 - #db = create_engine(db_url, echo=False) - for df in pd.read_csv( - fp, sep=' ', header=None, - compression='gzip', chunksize=chunksize): - ''' - Columns are for short format described - https://github.com/aidenlab/juicer/wiki/Pre#medium-format-most-common - ''' - cols = [ - 'readname', 'str1', 'chr1', 'pos1', 'frag1', 'str2', 'chr2', - 'pos2', 'frag2', 'mapq1', 'mapq2'] - df.columns = cols - df = (df.drop(['mapq1', 'mapq2'], axis=1).join( # Remove non-numeric MAPQ as seen in MitDNA - df[['mapq1', 'mapq2']].apply(pd.to_numeric, errors='coerce'))) - df = df[df[['mapq1', 'mapq2']].notnull().all(axis=1)] - self_ligation = ( # Filter self-ligating fragments - (df['chr1'] == df['chr2']) &\ - (df['frag1'] == df['frag2'])) - low_mapq = ( # Filter low quality reads - (df['mapq1'] >= mapq_cutoff) & (df['mapq2'] >= mapq_cutoff)) - db_cols = ['chr1', 'frag1', 'chr2', 'frag2'] - df = df[~self_ligation & low_mapq][db_cols] - df_reverse = df.rename( # Reversing to make SQL queries faster - columns={'chr1': 'chr2', - 'frag1': 'frag2', - 'chr2': 'chr1', - 'frag2': 'frag1'}) - df = pd.concat([ - df.sort_values(by=['chr1', 'frag1']), - df_reverse.sort_values(by=['chr1', 'frag1'])]) - df[['chr1', 'chr2']] = df[['chr1', 'chr2']].astype(str) - patched_chrom = (df['chr1'].str.contains('_')) - df = df[~patched_chrom] - if_exists = 'replace' if t.total == 0 else 'append' - df.to_sql( - rep.lower(), con=db, if_exists=if_exists, index=False) - t.total += len(df) - t.update(len(df)) - t.close() - print(' * Creating index...') - # '+', '-' throw a sqlite3.OperationalError - cell_index = cell_line.lower().replace('+', '') - cell_index = cell_index.replace('-', '_') - db.execute( - '''CREATE INDEX idx_{}_{}_chr1_frag1 ON {} - (chr1, frag1)'''.format( - cell_index, - rep.lower(), - rep.lower())) - print(' Time elapsed: {:.2f} mins'.format((time.time()-start_time)/60)) - - - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Build PostgreSQL database of Hi-C libraries for CoDeS3D.') - parser.add_argument( - '--hic-dir', required=True, - help='''The directory containing (subdirectories containing) - Hi-C non_dups.txt.gz files. Directories should be named by the cell lines.''') - parser.add_argument( - '-m', '--mapq-cutoff', default=30, - help='Mininum mapping quality score (mapq) of hic reads. (Default: 30)') - parser.add_argument( - "-a", "--db-auth", required=True, - help='''Connection string of database e.g 'postgresql://codes3d:@localhost' - or 'sqlite:///' ''') - parser.add_argument( - "-e", "--enzyme", required=True, - help='The restriction enzyme used to prepare the Hi-C library') - parser.add_argument( - "-t", "--tablespace", default=None, - help='Tablespace of database.') - return parser.parse_args() - - - -if __name__ == '__main__': - args = parse_args() - start_time = time.time() - if args.db_auth.startswith('postgres') and not args.tablespace: - sys.exit('''PostgreSQL databases require a TABLESPACE to be set. - Provide the name of the tablespace with the '--tablespace' flag. - See https://www.postgresqltutorial.com/postgresql-create-tablespace/''') - logfile = '/tmp/codes3d_db.log' - done = [] - if os.path.exists(logfile): - f = open(logfile, 'r') - done = [lib.rstrip() for lib in f] - print('Building...') - for first_level in os.listdir(args.hic_dir): - if os.path.isdir(os.path.join(args.hic_dir, first_level)): - for second_level in os.listdir(os.path.join(args.hic_dir, first_level)): - cell_line = first_level - fp = os.path.join(args.hic_dir, first_level, second_level) - if fp in done: - continue - build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, - args.tablespace) - with open(logfile, 'a') as f: - f.write('{}\n'.format(fp)) - elif os.path.isfile(os.path.join(args.hic_dir, first_level)): - cell_line = '' - if args.hic_dir.endswith('/'): - cell_line = args.hic_dir.strip().split('/')[-2] - else: - cell_line = args.hic_dir.strip().split('/')[-1] - fp = os.path.join(args.hic_dir,first_level) - if fp in done: - continue - build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, - args.tablespace) - with open(logfile, 'a') as f: - f.write('{}\n'.format(fp)) - - os.remove(logfile) - print('Total time elapsed: {:.2f} mins'.format((time.time()-start_time)/60)) - - diff --git a/codes3d/init_hic_db_sqlite.py b/codes3d/init_hic_db_sqlite.py deleted file mode 100755 index f371189..0000000 --- a/codes3d/init_hic_db_sqlite.py +++ /dev/null @@ -1,197 +0,0 @@ -#!usr/bin/env python -import os -import pandas as pd -from tqdm import tqdm -import argparse -from sqlalchemy import create_engine -from sqlalchemy_utils import create_database, database_exists -import sys -import codes3d - - -def build_hic_tables(cell_line, enzyme, fp, db_auth, mapq_cutoff, tblspace): - db_url = os.path.join(db_auth, 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower())) - if db_url.startswith('sqlite'): - db_url = db_url + '.db' - db = create_engine(db_url, echo=False) - if not database_exists(db_url): - create_database(db_url) - if tblspace and db_url.startswith('postgres'): - db.execute( - '''ALTER DATABASE {} SET TABLESPACE {}; '''.format( - 'hic_{}_{}'.format(db_url), tblspace)) - print('{} database created.'.format(db_url)) - rep = os.path.basename(fp).split('.')[0] - rep = rep[:rep.rfind('_merged')].split('_')[-1] - desc = ' * {} replicate {}'.format(cell_line, rep) - bar_format = '' - t = tqdm(total=0, desc=desc, bar_format=bar_format) - chunksize = 200000 - idx = 1 - #db = create_engine(db_url, echo=False) - for df in pd.read_csv( - fp, sep=' ', header=None, - compression='gzip', chunksize=chunksize): - ''' - Columns are for short format described - https://github.com/aidenlab/juicer/wiki/Pre#medium-format-most-common - ''' - cols = [ - 'readname', 'str1', 'chr1', 'pos1', 'frag1', 'str2', 'chr2', - 'pos2', 'frag2', 'mapq1', 'mapq2'] - df.columns = cols - self_ligation = ( # Filter self-ligating fragments - (df['chr1'] == df['chr2']) &\ - (df['frag1'] == df['frag2'])) - low_mapq = ( # Filter low quality reads - (df['mapq1'] >= mapq_cutoff) & (df['mapq2'] >= mapq_cutoff)) - db_cols = ['chr1', 'frag1', 'chr2', 'frag2'] - df = df[~self_ligation & low_mapq][db_cols] - df_reverse = df.rename( # Reversing to make SQL queries faster - columns={'chr1': 'chr2', - 'frag1': 'frag2', - 'chr2': 'chr1', - 'frag2': 'frag1'}) - df = pd.concat([ - df.sort_values(by=['chr1', 'frag1']), - df_reverse.sort_values(by=['chr1', 'frag1'])]) - df[['chr1', 'chr2']] = df[['chr1', 'chr2']].astype(str) - patched_chrom = (df['chr1'].str.contains('_')) - df = df[~patched_chrom] - if_exists = 'replace' if t.total == 0 else 'append' - df.to_sql( - rep.lower(), con=db, if_exists=if_exists, index=False) - idx += len(df) - t.total += len(df) - t.update(len(df)) - t.close() - db.execute( - '''CREATE INDEX idx_{}_{}_chr1_frag1 ON {} - (chr1, frag1)'''.format(cell_line, rep.lower(), rep.lower())) - - - -def build_hic_index( - input_hic_fp, - output_fp=None, - chr1_col=3, - chr2_col=7, - frag1_col=5, - frag2_col=9, - mapq1_col=10, - mapq2_col=11, - mapq_cutoff=30, - do_not_tidy_up=False): - if not output_fp: - if not input_hic_fp.rfind('.') == -1: - output_fp = input_hic_fp[:input_hic_fp.rfind('.')] + ".db" - else: - output_fp = input_hic_fp + ".db" - if os.path.isfile(output_fp): - overwrite = input( - "WARNING: Overwriting existing HiC database (%s). Continue? [y/N] " % - output_fp) - if not upsert.lower() == 'y': - print("Did not overwrite existing HiC database.") - return - os.remove(output_fp) - # Do line count for progress meter - print("Determining table size...") - hic_table = open(input_hic_fp, 'r') - lines = 0 - for i in hic_table: - lines += 1 - hic_table.close() - lines = lines // 100 * 100 # Get an approximation - do_linecount = not lines == 0 - - int_db = sqlite3.connect(output_fp) - interactions = int_db.cursor() - interactions.execute( - "CREATE TABLE IF NOT EXISTS interactions (chr1 text, fragment1 text, chr2 text, fragment2 text)") - interactions.execute( - "CREATE INDEX IF NOT EXISTS i_1 ON interactions (chr1, fragment1)") - chr1_col = int(chr1_col) - 1 - chr2_col = int(chr2_col) - 1 - frag1_col = int(frag1_col) - 1 - frag2_col = int(frag2_col) - 1 - mapq1_col = int(mapq1_col) - 1 - mapq2_col = int(mapq2_col) - 1 - with open(input_hic_fp, 'r') as rao_table: - print("Indexing HiC interaction table...") - for i, line in enumerate(rao_table): - if do_linecount: - if i % (lines / 100) == 0: - print("\tProcessed %d%%..." % - ((float(i) / float(lines)) * 100)) - interaction = line.strip().split(' ') - if int( - interaction[mapq1_col]) >= mapq_cutoff and int( - interaction[mapq2_col]) >= mapq_cutoff: - chr1 = interaction[chr1_col] - frag1 = interaction[frag1_col] - chr2 = interaction[chr2_col] - frag2 = interaction[frag2_col] - interactions.execute( - "INSERT INTO interactions VALUES(?,?,?,?)", [ - chr1, frag1, chr2, frag2]) - interactions.execute( - "INSERT INTO interactions VALUES(?,?,?,?)", [ - chr2, frag2, chr1, frag1]) - int_db.commit() - interactions.close() - print("Done indexing HiC interaction table.") - if not do_not_tidy_up: - print("Tidying up...") - os.remove(input_hic_fp) - - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Build SQLite database of Hi-C libraries for CoDeS3D.') - parser.add_argument( - '--hic-dir', required=True, - help='The directory containing Hi-C non_dups.txt.gz files.') - parser.add_argument( - '-m', '--mapq-cutoff', default=30, - help='Mininum mapping quality score (mapq) of hic reads.') - parser.add_argument( - "-a", "--db-auth", required=True, - help='URL of database e.g postgresql://user:password@hostname') - parser.add_argument( - "-e", "--enzyme", required=True, - help='The restriction enzyme used to prepare the Hi-C library') - parser.add_argument( - "-t", "--tablespace", default=None, - help='Tablespace of database.') - parser.add_argument("-c", "--config", - default=os.path.join(os.path.dirname(__file__), - "../docs/codes3d.conf"), - help="The configuration file to be used for this " + - "run (default: codes3d.conf)") - - return parser.parse_args() - - - -if __name__ == '__main__': - args = parse_args() - - print('Building...') - for first_level in os.listdir(args.hic_dir): - if os.path.isdir(os.path.join(args.hic_dir, first_level)): - for second_level in os.listdir(os.path.join(args.hic_dir, first_level)): - cell_line = first_level - fp = os.path.join(args.hic_dir,first_level, second_level) - build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, args.tablespace) - elif os.path.isfile(os.path.join(args.hic_dir, first_level)): - cell_line = '' - if args.hic_dir.endswith('/'): - cell_line = args.hic_dir.strip().split('/')[-2] - else: - cell_line = args.hic_dir.strip().split('/')[-1] - fp = os.path.join(args.hic_dir,first_level) - build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, args.tablespace) - - diff --git a/codes3d/init_hic_meta.py b/codes3d/init_hic_meta.py deleted file mode 100755 index 76a078a..0000000 --- a/codes3d/init_hic_meta.py +++ /dev/null @@ -1,30 +0,0 @@ -#! /usr/bin/env python - -import os -import pandas as pd -import sys -from sqlalchemy import create_engine - - -if __name__=='__main__': - """Create table for meta info on Hi-C libraries.""" - hic_fp = os.path.join(os.path.dirname(__file__), '../lib/hic') - enzyme_dict = [] - for enzyme in os.listdir(hic_fp): - enzyme_fp = os.path.join(hic_fp, enzyme) - for library in os.listdir(os.path.join(enzyme_fp, 'hic_data')): - library_fp = os.path.join(enzyme_fp, 'hic_data', library) - for rep in os.listdir(library_fp): - enzyme_dict.append((rep.split('.')[0], enzyme, library)) - replicates = pd.DataFrame( - enzyme_dict, columns=['replicate', 'enzyme', 'library']) - libraries = replicates.groupby( - ['library', 'enzyme']).size().reset_index(name='rep_count') - from_file = pd.read_csv('/mnt/projects/codes3d/lib/meta_hic.txt', sep='\t') - libraries = libraries.merge( - from_file, how='left', on=['library', 'enzyme'] - ).drop_duplicates() - - db = create_engine( - 'postgres://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False) - libraries.drop_duplicates().to_sql('meta_hic', con=db, if_exists='replace') diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100755 index 03b5bf4..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -# python3 version >3.5.2 -pandas == 1.0.5 -numpy == 1.19.1 -pandas-plink == 2.0.4 -psycopg2 == 2.8.5 -requests == 2.24.0 -pybedtools == 0.8.1 -biopython == 1.77 -statsmodels == 0.11.1 -tqdm == 4.48.0 -scikits.bootstrap == 1.0.1 -torch -pandas_plink == 2.0.4 -sqlalchemy == 1.3.18 -sqlalchemy_utils == 0.36.8 -more_itertools == 8.2.0 diff --git a/environment.yaml b/environment.yaml index 71ab651..952def2 100755 --- a/environment.yaml +++ b/environment.yaml @@ -18,4 +18,4 @@ dependencies: - statsmodels #=0.11.1 - pytorch #=1.9.0 - tqdm #=4.48.0 - + - more-itertools