Skip to content

Commit

Permalink
Merge pull request #3 from sreemol-gokuladhas/V3.1
Browse files Browse the repository at this point in the history
Added new feature: PCHi-C data analysis
  • Loading branch information
tayaza authored Jun 21, 2022
2 parents 94e857b + cbdd3eb commit 94024bf
Show file tree
Hide file tree
Showing 12 changed files with 1,335 additions and 435 deletions.
272 changes: 175 additions & 97 deletions codes3d/codes3d.py

Large diffs are not rendered by default.

264 changes: 176 additions & 88 deletions codes3d/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,50 @@
from itertools import repeat


def get_gene_fragments(gene_df, restriction_enzymes, db, pchic=False):
    """Attach restriction-fragment records to each gene in ``gene_df``.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Genes to annotate. Must contain 'id' and 'chrom'; for PCHi-C it
        must also contain 'start', 'end', 'name' and 'gencode_id'.
    restriction_enzymes : iterable of str
        Enzymes whose fragment lookup tables are queried.
    db : database engine
        SQLAlchemy-style engine (has ``dispose()`` and ``connect()``).
    pchic : bool, optional
        When True, query the PCHi-C lookup tables (keyed by gencode_id)
        instead of the Hi-C tables (keyed by numeric gene id range).

    Returns
    -------
    pandas.DataFrame
        ``gene_df`` inner-joined with the matching fragment rows; the
        'enzyme' column records which enzyme table each row came from.
    """
    db.dispose()
    gene_df = gene_df.sort_values(by=['id'])
    fragment_df = []
    chunksize = 1000
    chunks = [gene_df[i:i+chunksize]
              for i in range(0, gene_df.shape[0], chunksize)]
    for enzyme in restriction_enzymes:
        table = 'gene_lookup_pchic_{}' if pchic else 'gene_lookup_{}'
        # MboI and DpnII share restriction sites, so they share one table.
        if enzyme in ['MboI', 'DpnII']:
            table = table.format('mboi')
        else:
            table = table.format(enzyme.lower())
        with db.connect() as con:
            for chunk in chunks:
                if pchic:
                    # Perf fix: one IN(...) query per chunk instead of one
                    # query per gene row; the unioned result is identical
                    # after the drop_duplicates() below.
                    ids = "', '".join(
                        str(g) for g in chunk['gencode_id'].drop_duplicates())
                    sql = ("SELECT * FROM {} WHERE gencode_id IN ('{}')"
                           .format(table, ids))
                else:
                    sql = ('SELECT * FROM {} WHERE id >= {} AND id <= {}'
                           .format(table, chunk['id'].min(),
                                   chunk['id'].max()))
                df = pd.read_sql(sql, con=con)
                df['enzyme'] = enzyme
                fragment_df.append(df)
    fragment_df = pd.concat(fragment_df).drop_duplicates()
    if pchic:
        gene_df = pd.merge(
            gene_df, fragment_df, how='inner',
            left_on=['chrom', 'start', 'end', 'name', 'gencode_id'],
            right_on=['chr', 'start', 'end', 'gene', 'gencode_id'])
    else:
        gene_df = pd.merge(gene_df, fragment_df, how='inner',
                           on=['id', 'chrom'])
    return gene_df


Expand Down Expand Up @@ -70,139 +92,201 @@ def process_snp_genes(
def find_snp_genes(
        chunk_df,
        enzyme,
        enzyme_genes,
        pchic=False):
    """Pool worker: look up genes whose fragments interact with the SNP
    fragments in ``chunk_df`` and append the merged rows to the shared
    ``enzyme_genes`` list.

    Relies on the module-level ``db`` engine set by ``get_gene_by_id``
    before the pool is started. Appends nothing when the lookup is empty.
    """
    db.dispose()  # drop inherited connections; this worker reconnects
    if pchic:
        table = 'gene_lookup_pchic_{}'
    else:
        chunk_df = chunk_df.sort_values(by=['fragment'])
        # A Hi-C chunk holds rows from a single chromosome (see caller).
        chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0]
        table = 'gene_lookup_{}'
    # MboI and DpnII share restriction sites, so they share one table.
    if enzyme in ['MboI', 'DpnII']:
        table = table.format('mboi')
    else:
        table = table.format(enzyme.lower())
    if pchic:
        inter_fids = chunk_df['inter_fid'].unique().tolist()
        # Fix: build the IN list explicitly. The previous code special-
        # cased a single id because tuple-formatting one element would
        # emit invalid SQL "(x,)", and its fallback "(ls[0])" was not
        # actually a tuple. repr() quotes strings and leaves ints bare,
        # matching the old tuple rendering for the multi-id case.
        id_list = ', '.join(repr(fid) for fid in inter_fids)
        sql = 'SELECT * FROM {} WHERE frag_id IN ({})'.format(table, id_list)
    else:
        chunk_df['fragment'] = chunk_df['fragment'].astype(int)
        sql = ''' SELECT * FROM {0}
        JOIN genes on {0}.id=genes.id
        WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}'''
        sql = sql.format(table, chrom,
                         chunk_df['fragment'].min(),
                         chunk_df['fragment'].max())
    with db.connect() as con:
        df = pd.read_sql_query(sql, con)
    if df.empty:
        return
    if pchic:
        df = df.rename(
            columns={'chr': 'gene_chr', 'start': 'gene_start',
                     'end': 'gene_end'})
        chunk_df = pd.merge(chunk_df, df, how='inner', sort=False,
                            left_on='inter_fid', right_on='frag_id')
    else:
        # The JOIN can duplicate column names; keep the first occurrence.
        df = df.loc[:, ~df.columns.duplicated()]
        df = df.rename(
            columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr',
                     'start': 'gene_start', 'end': 'gene_end'})
        chunk_df = pd.merge(chunk_df, df, how='inner',
                            left_on=['fragment_chr', 'fragment'],
                            right_on=['gene_chr', 'frag_id'])
    enzyme_genes.append(chunk_df)


def fetch_3dgi_libs(db, pchic=False):
    """Return the deduplicated library metadata for the 3D-genome assay.

    Reads ``meta_pchic`` when ``pchic`` is True, otherwise ``meta_hic``,
    returning the 'library', 'enzyme' and 'rep_count' columns.
    """
    meta_table = 'meta_pchic' if pchic else 'meta_hic'
    query = 'SELECT library, enzyme, rep_count FROM {}'.format(meta_table)
    with db.connect() as con:
        libs = pd.read_sql_query(query, con=con)
    return libs.drop_duplicates()

def get_gene_by_id(
        snp_df,
        inter_df,
        _db,
        logger,
        pchic=False):
    """Map SNP-containing fragments to interacting genes (Hi-C) or gene
    promoters (PCHi-C).

    Parameters
    ----------
    snp_df : pandas.DataFrame
        SNP table; must contain 'fragment' and 'enzyme' (and 'chrom' for
        the Hi-C path).
    inter_df : pandas.DataFrame
        Chromatin-interaction rows, one per fragment interaction.
    _db : database engine
        SQLAlchemy-style engine; stored in the module-level ``db`` so
        pool workers inherit it.
    logger : Logger
        Project logger used for progress messages.
    pchic : bool, optional
        When True, use the PCHi-C lookup tables and scoring columns.

    Returns
    -------
    pandas.DataFrame
        SNP-gene pairs with their interaction support statistics.
    """
    if pchic:
        logger.write('Identifying gene promoters interacting with SNPs in...')
    else:
        logger.write('Identifying genes interacting with SNPs in...')
    global db
    db = _db  # module global: inherited by the multiprocessing workers
    start_time = time.time()
    enzymes = inter_df['enzyme'].drop_duplicates().tolist()
    all_genes_df = []
    _3dgi_libs = fetch_3dgi_libs(db, pchic)
    _3dgi_libs = _3dgi_libs.rename(
        columns={'rep_count': 'cell_line_replicates'})
    for enzyme in enzymes:
        manager = multiprocessing.Manager()
        num_processes = int(min(16, multiprocessing.cpu_count()/2))
        enzyme_genes = manager.list()
        enzyme_df = []
        with multiprocessing.Pool(processes=num_processes) as pool:
            df = inter_df[inter_df['enzyme'] == enzyme]
            if pchic:
                # Fix: work on an explicit copy so the inter_fid column
                # assignment does not hit a slice view (SettingWithCopy).
                df_subset = df[['p_fid', 'oe_fid', 'n_reads', 'score',
                                'query_type', 'query_fragment', 'replicate',
                                'cell_line', 'enzyme']].copy()
                # The interacting fragment is whichever end of the pair
                # is NOT the queried fragment.
                df_subset['inter_fid'] = np.where(
                    df_subset['query_fragment'] == df_subset['p_fid'],
                    df_subset['oe_fid'], df_subset['p_fid'])
                # Fix: iterate each cell line once; the previous per-row
                # iteration queued one duplicate batch per DataFrame row.
                snp_interactions = [
                    df_subset[df_subset['cell_line'] == cell_line]
                    for cell_line in
                    df_subset['cell_line'].drop_duplicates().to_list()
                ]
                desc = '  * PCHi-C libraries restricted with {}'.format(
                    enzyme)
            else:
                snp_interactions = [
                    df[df['fragment_chr'] == chrom]
                    for chrom in df['fragment_chr'].drop_duplicates().to_list()
                ]
                desc = '  * Hi-C libraries restricted with {}'.format(enzyme)
            bar_format = ('{desc}: {percentage:3.0f}% '
                          '|{bar}| {n_fmt}/{total_fmt} {unit}')
            for _ in tqdm.tqdm(pool.istarmap(
                    find_snp_genes,
                    zip(snp_interactions,
                        repeat(enzyme),
                        repeat(enzyme_genes),
                        repeat(pchic))),
                    total=len(snp_interactions), desc=desc, unit='batches',
                    ncols=80, bar_format=bar_format):
                pass
        for df in enzyme_genes:
            enzyme_df.append(df)
        enzyme_df = pd.concat(enzyme_df)
        enzyme_df = enzyme_df.merge(
            _3dgi_libs, how='left',
            left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme'])
        if pchic:
            enzyme_df = enzyme_df.drop(
                columns=['p_fid', 'oe_fid', 'library', 'frag_id',
                         'cell_line_replicates'])
        else:
            # Interactions supporting each SNP-gene pair per cell line.
            enzyme_df['interactions'] = enzyme_df.groupby(
                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
                'fragment'].transform('count')
            df = enzyme_df[
                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line',
                 'replicate']].drop_duplicates()
            df['replicates'] = df.groupby(
                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
                'replicate'].transform('count')
            enzyme_df = enzyme_df.merge(
                df, how='left',
                on=['query_chr', 'query_fragment', 'gencode_id',
                    'cell_line', 'replicate'])
            enzyme_df = enzyme_df.drop(
                columns=['fragment_chr', 'fragment',
                         'frag_id', 'library', 'replicate'])
        enzyme_df = enzyme_df.drop_duplicates()
        all_genes_df.append(enzyme_df.drop_duplicates())
    if not pchic:
        logger.write('  * Filtering gene interactions...')
    all_genes_df = pd.concat(all_genes_df)
    if pchic:
        all_genes_df = all_genes_df.drop_duplicates()
        # Aggregate supporting reads (sum) and interaction scores (mean,
        # rounded to 2 dp) per SNP fragment / gene / cell line.
        df = all_genes_df[['query_fragment', 'gencode_id', 'cell_line',
                           'n_reads', 'score']].drop_duplicates()
        df['N_reads'] = df.groupby(
            ['query_fragment', 'gencode_id', 'cell_line'])[
            'n_reads'].transform('sum')
        df['Score'] = df.groupby(
            ['query_fragment', 'gencode_id', 'cell_line', 'N_reads'])[
            'score'].transform('mean').round(2)
        df = df[['query_fragment', 'gencode_id', 'cell_line', 'N_reads',
                 'Score']].drop_duplicates()
        all_genes_df = all_genes_df.merge(
            df, on=['query_fragment', 'gencode_id', 'cell_line'],
            how='left'
        ).drop(['n_reads', 'score', 'inter_fid'], axis=1).drop_duplicates()
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'enzyme'],
            right_on=['fragment', 'enzyme'],
            how='inner').drop_duplicates()
        gene_df = all_genes_df[
            ['snp', 'chrom', 'locus', 'variant_id',
             'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
             'N_reads', 'Score', 'cell_line']].drop_duplicates()
    else:
        all_genes_df = all_genes_df.merge(
            snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'],
            right_on=['fragment', 'chrom', 'enzyme'], how='inner')
        all_genes_df['sum_cell_lines'] = all_genes_df.groupby(
            ['snp', 'gene'])['cell_line'].transform('count')
        all_genes_df['sum_interactions'] = all_genes_df.groupby(
            ['snp', 'gene'])['interactions'].transform('sum')
        all_genes_df['sum_replicates'] = all_genes_df.groupby(
            ['snp', 'gene'])['replicates'].transform('sum')
        # Discard weakly supported pairs: at most one interaction per
        # replicate, in fewer than 2 replicates and fewer than 2 cell
        # lines.
        condition = (
            (all_genes_df['interactions'] /
             all_genes_df['cell_line_replicates'] <= 1) &
            (all_genes_df['sum_replicates'] < 2) &
            (all_genes_df['sum_cell_lines'] < 2))
        gene_df = all_genes_df[~condition].drop_duplicates()
        cols = ['snp', 'chrom', 'locus', 'variant_id',
                'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end',
                'interactions', 'replicates', 'cell_line',
                'cell_line_replicates',
                'sum_interactions', 'sum_replicates', 'sum_cell_lines']
    # Fix: typo "elasped" -> "elapsed" in the user-facing log line.
    logger.write('  Time elapsed: {:.2f} mins.'.format(
        (time.time() - start_time)/60))
    if pchic:
        return gene_df
    return gene_df[cols].drop_duplicates()

def get_gene_by_position(df, db):
gene_df = []
Expand Down Expand Up @@ -288,6 +372,7 @@ def get_gene_info(
output_dir,
db,
logger,
pchic = False,
suppress_intermediate_files=False):
enzymes = hic_df['enzyme'].drop_duplicates().tolist()
gene_df = []
Expand Down Expand Up @@ -340,7 +425,10 @@ def get_gene_info(
len(omitted_genes))
msg = msg + ':\n\t{}'.format(', '.join(omitted_genes))
logger.write(msg)

gene_df = get_gene_fragments(gene_df, enzymes, db)
gene_df = get_gene_fragments(gene_df, enzymes, db, pchic)
gene_df = gene_df.rename(columns={'frag_id': 'fragment'})
if pchic:
gene_df = gene_df.drop(['chr','gene'], axis=1)
else:
pass
return(gene_df)
Loading

0 comments on commit 94024bf

Please sign in to comment.