From bb3c4d04726455a1e9798eba7aaacef0c2c6c95c Mon Sep 17 00:00:00 2001
From: sreemol-gokuladhas
Date: Sat, 18 Jun 2022 20:07:30 +1200
Subject: [PATCH 1/2] Added new feature: PCHi-C data analysis

---
 codes3d/codes3d.py                            | 258 +++++----
 codes3d/genes.py                              | 264 ++++++---
 codes3d/interactions.py                       | 173 ++++--
 codes3d/snps.py                               | 502 ++++++++++++------
 codes3d/summary.py                            | 108 ++--
 .../create_index_pchic_sqlite3_db.py          |  31 ++
 data_preparation/init_pchic_gene_tables.py    |  78 +++
 data_preparation/init_pchic_meta.py           |  31 ++
 data_preparation/init_pchic_sqlite_db.py      | 155 ++++++
 .../init_variant_fragment_table_pchic.py      |  98 ++++
 .../tidy_promoter_baitmap_file.py             |  56 ++
 docs/codes3d.conf                             |   2 +-
 12 files changed, 1321 insertions(+), 435 deletions(-)
 create mode 100644 data_preparation/create_index_pchic_sqlite3_db.py
 create mode 100644 data_preparation/init_pchic_gene_tables.py
 create mode 100755 data_preparation/init_pchic_meta.py
 create mode 100644 data_preparation/init_pchic_sqlite_db.py
 create mode 100755 data_preparation/init_variant_fragment_table_pchic.py
 create mode 100644 data_preparation/tidy_promoter_baitmap_file.py

diff --git a/codes3d/codes3d.py b/codes3d/codes3d.py
index ef4e1b4..d8965fd 100755
--- a/codes3d/codes3d.py
+++ b/codes3d/codes3d.py
@@ -12,7 +12,6 @@ from sqlalchemy.pool import NullPool
 import tqdm
 import statsmodels.stats.multitest as multitest
-
 import snps
 import genes
 import interactions
@@ -52,10 +51,10 @@ def parse_tissues(user_tissues, match_tissues, eqtl_project, db):
         else:
             matched_df.append(u_df)
             matched_tissues.append(u_tissue)
-    error_msg = 'Program aborting:\n\t{}\nnot found in database.'
+    error_msg = 'Program aborting: {} not found in database.'
     if (len(matched_df) == 0 or len(not_tissues) > 0) and len(to_omit) == 0:
         print(error_msg.format('\n\t'.join(not_tissues)))
-        print('\nPlease use one of the following. ' +
+        print('\nPlease use one of the following with the -t flag. ' +
               'Tissue names are case sensitive:')
         list_eqtl_tissues(db)
         sys.exit()
@@ -93,7 +92,8 @@ def parse_hic(
         include_cell_line,
         exclude_cell_line,
         restriction_enzymes,
-        db):
+        db,
+        pchic=False):
     ''' user parameters -r, -n and -x.
     Args:
@@ -103,9 +103,13 @@ def parse_hic(
         exclude_cell_line: space-delimited list of cell_lines from -x
     Returns:
         hic_df: a dataframe columns(library, enzyme, rep_count)
+            of Hi-C libraries, or of PCHi-C libraries when pchic=True
     '''
-
-    sql = '''SELECT library, tags, enzyme, rep_count FROM meta_hic'''
+    if not pchic:
+        sql = '''SELECT library, tags, enzyme, rep_count FROM meta_hic'''
+    else:
+        sql = '''SELECT library, tags, enzyme, rep_count FROM meta_pchic'''
+
     df = pd.DataFrame()
     with db.connect() as con:
         df = pd.read_sql(sql, con=con)
@@ -122,6 +126,9 @@ def parse_hic(
             (df['tags'].str.contains(
                 r'\b{}\b'.format(u_tissue), case=False))
         ]
+        #u_df = df[
+        #    df['library'].str.contains(u_tissue,case=False) |
+        #    df['tags'].str.contains(u_tissue,case=False)]
         if u_df.empty:
             if u_tissue.startswith(u_tissue):
                 to_omit.append(u_tissue)
@@ -134,7 +141,7 @@ def parse_hic(
     if (len(matched_df) == 0 or len(not_tissues) > 0) and len(to_omit) == 0:
         print(error_msg.format('\n\t'.join(not_tissues)))
         print(('Use -t and -n to include specific eQTL tissues'
-               ' and Hi-C libraries. Library names are case sensitive:'))
+               ' and Hi-C/PCHi-C libraries. 
Library names are case sensitive:')) sys.exit('\n\t{}'.format('\n\t'.join(df['library'].tolist()))) if len(matched_df) == 0 and len(to_omit) > 0: hic_df = df @@ -150,7 +157,6 @@ def parse_hic( ~hic_df['tags'].str.contains( r'\b{}\b'.format(to_omit[i][1:]), case=False)] hic_df = hic_df.drop_duplicates() - elif include_cell_line and len(include_cell_line) > 0: validate_input(include_cell_line, df['library']) hic_df = df[df['library'].str.upper().isin( @@ -170,7 +176,6 @@ def parse_hic( else: return df.drop_duplicates() - def validate_input(input_params, params): input_params_upper = [c.upper() for c in input_params] not_found = set(input_params_upper).difference( @@ -303,8 +308,8 @@ def map_non_spatial_eqtls( commons_db, covariates_dir, expression_dir, - logger): - + logger, + pchic = False): gene_info_df = None gene_list = None if not snp_df.empty: @@ -312,12 +317,13 @@ def map_non_spatial_eqtls( print('No gene list given. Will map across all genes.') else: gene_info_df = genes.get_gene_info( - args.gene_list, - hic_df, - args.output_dir, - commons_db, - logger, - args.suppress_intermediate_files) + args.gene_list, + hic_df, + args.output_dir, + commons_db, + logger, + pchic, + args.suppress_intermediate_files) gene_info_df = gene_info_df.rename(columns={ 'name': 'gene', 'chrom': 'gene_chrom', @@ -409,7 +415,7 @@ def map_non_spatial_eqtls( ''' fp = os.path.join(args.output_dir, 'non_spatial_eqtls.txt') eqtl_df[cols].to_csv(fp, sep='\t', index=False) - logger.write(f'Output written to {fp}') + logger.write(f'Output written to {fp}.') msg = 'Done.\nTotal time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60) logger.write(msg) @@ -426,7 +432,8 @@ def map_spatial_eqtls( genotypes_fp, eqtl_project_db, covariates_dir, - expression_dir): + expression_dir, + pchic=False): gene_df = [] eqtl_df = [] batchsize = 2000 @@ -447,14 +454,15 @@ def map_spatial_eqtls( 'snp', C.lib_dir, hic_df, - # batch_output_dir, args.num_processes, - logger) + logger, + pchic) batch_gene_df = genes.get_gene_by_id( - batch_snp_df, - batch_interactions_df, - commons_db, - logger) + batch_snp_df, + batch_interactions_df, + commons_db, + logger, + pchic) gene_df.append(batch_gene_df) if batch_gene_df.empty: continue @@ -542,8 +550,6 @@ def __init__(self, config_fp): self.hic_restriction_enzymes = [ e.strip() for e in os.listdir(self.hic_dir)] - - def parse_args(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, @@ -568,6 +574,9 @@ def parse_args(): parser.add_argument( '-o', '--output-dir', help='The directory in which to output results.') + parser.add_argument( + '--pchic', action='store_true', default=False, + help='''Use this flag to use pchic datasets instead of hic.''') parser.add_argument( '--multi-test', default = 'multi', help='''Options for BH multiple-testing: ['snp', 'tissue', 'multi']. @@ -626,7 +635,7 @@ def parse_args(): Default is all tissues from the GTEx project. Use 'codes3d.py --list-eqtl-tissues' for a list of installed tissues.''') parser.add_argument( - '--eqtl-project', type=str, default=None, + '--eqtl-project', type=str, default='GTEx', help='''The eQTL project to query. Default: GTEx. 
'use \'codes3d.py --list-eqtl-db\' to list available databases.''') parser.add_argument( @@ -729,14 +738,33 @@ def log_settings(args, logger): logger.write(f'FDR threshold:\t{args.fdr_threshold}') logger.write(f'MAF threshold:\t{args.maf_threshold}') logger.write(f'Effect size:\t{effect_size}') - if not args.match_tissues and not args.include_cell_lines and \ - not args.exclude_cell_lines: + + if not args.pchic and not args.non_spatial: + logger.write('Using Hi-C datasets\n') + else: + if args.pchic and not args.non_spatial: + logger.write('Using PCHi-C datasets\n') + + if not args.pchic and not args.match_tissues and not args.include_cell_lines and \ + not args.exclude_cell_lines: if not args.non_spatial: logger.write('Hi-C libraries:\tAll libraries in database') else: - if not args.non_spatial: + if not args.pchic and not args.non_spatial: logger.write('Hi-C libraries:\t{}'.format( ', '.join(hic_df['library'].tolist()))) + + + if args.pchic and not args.match_tissues and not args.include_cell_lines and \ + not args.exclude_cell_lines: + if not args.non_spatial: + logger.write('PCHi-C libraries:\tAll libraries in database') + else: + if args.pchic and not args.non_spatial: + logger.write('PCHi-C libraries:\t{}'.format( + ', '.join(hic_df['library'].tolist()))) + + if not args.tissues and not args.match_tissues: logger.write('eQTL tissues:\tAll tissues in database\n') else: @@ -770,24 +798,42 @@ def log_settings(args, logger): args.include_cell_lines, args.exclude_cell_lines, args.restriction_enzymes, - commons_db) - if args.match_tissues and not args.non_spatial: - logger.write('Hi-C libraries:\t{}'.format( + commons_db, + args.pchic) + + if not args.pchic and args.match_tissues and not args.non_spatial: + logger.write('Using Hi-C libraries:\t{}'.format( ', '.join(hic_df['library'].tolist()))) logger.write('\neQTL tissues:\t{}\n'.format( ', '.join(tissues['name'].tolist()))) - - upsert = input( + + upsert = input( '''WARNING: We've tried to match your Hi-C and eQTL tissues above. - Continue? [y/N]''' - ) + Continue? [y/n]''') if not upsert.lower() == 'y': print(('Use -t and -n to include specific eQTL tissues' ' and Hi-C libraries')) sys.exit('Exiting.') + + if args.pchic and args.match_tissues and not args.non_spatial: + logger.write('Using PCHi-C libraries:\t{}'.format( + ', '.join(hic_df['library'].tolist()))) + logger.write('\neQTL tissues:\t{}\n'.format( + ', '.join(tissues['name'].tolist()))) + + upsert = input( + '''WARNING: We've tried to match your PCHi-C and eQTL tissues above. + Continue? 
[y/n]''') + if not upsert.lower() == 'y': + print(('Use -t and -n to include specific eQTL tissues' + ' and PCHi-C libraries')) + sys.exit('Exiting.') + log_settings(args, logger) + eqtl_project = tissues['project'].tolist()[0] + config = configparser.ConfigParser() config.read(args.config) covariates_dir = os.path.join( @@ -803,6 +849,7 @@ def log_settings(args, logger): snp_df = pd.DataFrame() gene_df = [] eqtl_df = [] + if args.gene_input: gene_info_df = genes.get_gene_info( args.gene_input, @@ -810,6 +857,7 @@ def log_settings(args, logger): args.output_dir, commons_db, logger, + args.pchic, args.suppress_intermediate_files) if args.non_spatial: map_non_spatial_eqtls( @@ -824,33 +872,36 @@ def log_settings(args, logger): commons_db, covariates_dir, expression_dir, - logger) + logger, + args.pchic) interactions_df = interactions.find_interactions( gene_info_df, 'gencode_id', C.lib_dir, hic_df, - #args.output_dir, args.num_processes, logger, + args.pchic, args.suppress_intermediate_files) snp_df, gene_df, eqtl_df = snps.find_snps( - interactions_df, - gene_info_df, - tissues, - args.output_dir, - C, - genotypes_fp, - eqtl_project_db, - covariates_dir, - expression_dir, - args.pval_threshold, - args.maf_threshold, - args.fdr_threshold, - args.num_processes, - commons_db, - logger, - args.suppress_intermediate_files) + interactions_df, + gene_info_df, + tissues, + args.output_dir, + C, + genotypes_fp, + eqtl_project_db, + covariates_dir, + expression_dir, + args.pval_threshold, + args.maf_threshold, + args.fdr_threshold, + args.num_processes, + commons_db, + logger, + args.pchic, + args.suppress_intermediate_files) + if gene_df.empty or snp_df.empty or eqtl_df.empty: logger.write('No eQTLs found.\nProgram exiting.') sys.exit() @@ -859,27 +910,32 @@ def log_settings(args, logger): args.output_dir, 'snps.txt'), sep='\t', index=False) gene_df.to_csv(os.path.join( args.output_dir, 'genes.txt'), sep='\t', index=False) + eqtl_df.to_csv(os.path.join( + args.output_dir, 'eqtls.txt'), sep='\t', index=False) + if args.snp_input or args.snps_within_gene: interactions_df = [] gene_df = [] gene_info_df = None if args.snps_within_gene: gene_info_df = genes.get_gene_info( - args.snps_within_gene, - hic_df, - args.output_dir, - commons_db, - logger, - args.suppress_intermediate_files) + args.snps_within_gene, + hic_df, + args.output_dir, + commons_db, + logger, + args.pchic, + args.suppress_intermediate_files) snp_df = snps.get_snp( - args.snp_input, - gene_info_df, - hic_df, - args.output_dir, - eqtl_project_db, - C.rs_merge_arch_fp, - logger, - args.suppress_intermediate_files) + args.snp_input, + gene_info_df, + hic_df, + args.output_dir, + eqtl_project_db, + C.rs_merge_arch_fp, + logger, + args.pchic, + args.suppress_intermediate_files) if not args.suppress_intermediate_files: snp_df.to_csv(os.path.join( args.output_dir, 'snps.txt'), sep='\t', index=False) @@ -887,41 +943,49 @@ def log_settings(args, logger): if args.non_spatial: map_non_spatial_eqtls( - snp_df, - pd.DataFrame(), - tissues, - hic_df, - C, - args, - genotypes_fp, - eqtl_project_db, - commons_db, - covariates_dir, - expression_dir, - logger) + snp_df, + pd.DataFrame(), + tissues, + hic_df, + C, + args, + genotypes_fp, + eqtl_project_db, + commons_db, + covariates_dir, + expression_dir, + logger, + args.pchic) else: gene_df, eqtl_df = map_spatial_eqtls( - snp_list, - C, - hic_df, - args, - logger, - commons_db, - tissues, - genotypes_fp, - eqtl_project_db, - covariates_dir, - expression_dir) + snp_list, + C, + hic_df, + args, + logger, + 
commons_db, + tissues, + genotypes_fp, + eqtl_project_db, + covariates_dir, + expression_dir, + args.pchic + ) + eqtl_df = multi_test_correction(eqtl_df, args.multi_test.lower()) - logger.write(' * {} eQTL associations passed FDR <= {}.'.format( - len(eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold]), - args.fdr_threshold)) + #logger.write(' * {} eQTL associations passed FDR <= {}.'.format( + # len(eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold]), + # args.fdr_threshold)) logger.write(' * eQTLs mapped at MAF >= {} and pval threshold <={}.'.format( args.maf_threshold, args.pval_threshold)) if not args.suppress_intermediate_files: eqtl_df.to_csv(os.path.join( args.output_dir, 'eqtls.txt'), sep='\t', index=False) eqtl_df = eqtl_df[eqtl_df['adj_pval'] <= args.fdr_threshold] + + if eqtl_df.empty: + logger.write(' * No significant eQTL associations found. Program exiting.') + sys.exit() if not args.do_not_produce_summary: if not args.no_afc: @@ -939,7 +1003,7 @@ def log_settings(args, logger): summary.produce_summary( eqtl_df, snp_df, gene_df, expression_table_fp, args.fdr_threshold, args.output_dir, - args.num_processes, args.output_format, args.no_afc, logger) + args.num_processes, args.output_format, args.no_afc, logger, args.pchic) msg = 'Done.\nTotal time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60) logger.write(msg) diff --git a/codes3d/genes.py b/codes3d/genes.py index cea6e11..271947c 100755 --- a/codes3d/genes.py +++ b/codes3d/genes.py @@ -12,7 +12,7 @@ from itertools import repeat -def get_gene_fragments(gene_df, restriction_enzymes, db): +def get_gene_fragments(gene_df, restriction_enzymes, db, pchic=False): db.dispose() gene_df = gene_df.sort_values(by=['id']) fragment_df = [] @@ -20,20 +20,42 @@ def get_gene_fragments(gene_df, restriction_enzymes, db): chunks = [gene_df[i:i+chunksize] for i in range(0, gene_df.shape[0], chunksize)] for enzyme in restriction_enzymes: - table = 'gene_lookup_{}' + if pchic: + table = 'gene_lookup_pchic_{}' + else: + table = 'gene_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - sql = '''SELECT * FROM {} WHERE id >= {} AND id <= {}''' + + if pchic: + sql = '''SELECT * FROM {} WHERE gencode_id = '{}' ''' + else: + sql = '''SELECT * FROM {} WHERE id >= {} AND id <= {}''' + with db.connect() as con: for chunk in chunks: - df = pd.read_sql(sql.format( - table, chunk['id'].min(), chunk['id'].max()), con=con) - df['enzyme'] = enzyme - fragment_df.append(df) - fragment_df = pd.concat(fragment_df) - gene_df = pd.merge(gene_df, fragment_df, how='inner', on=['id', 'chrom']) + if not pchic: + df = pd.read_sql(sql.format( + table, chunk['id'].min(), chunk['id'].max()), con=con) + df['enzyme'] = enzyme + fragment_df.append(df) + else: + for _, row in chunk.iterrows(): + df = pd.read_sql(sql.format( + table, row['gencode_id']), con=con) + df['enzyme'] = enzyme + fragment_df.append(df) + fragment_df = pd.concat(fragment_df).drop_duplicates() + if pchic: + gene_df = pd.merge(gene_df, fragment_df, how='inner', + left_on = ['chrom','start','end','name','gencode_id'], + right_on = ['chr', 'start', 'end', 'gene', 'gencode_id']) + else: + gene_df = pd.merge(gene_df, fragment_df, how='inner', on=['id', 'chrom']) + return gene_df @@ -70,59 +92,88 @@ def process_snp_genes( def find_snp_genes( chunk_df, enzyme, - enzyme_genes): + enzyme_genes, + pchic=False): db.dispose() - chunk_df = chunk_df.sort_values(by=['fragment']) - chrom = 
chunk_df['fragment_chr'].drop_duplicates().to_list()[0] - table = 'gene_lookup_{}' + if pchic: + #celline = chunk_df['cell_line'].drop_duplicates().to_list() + table = 'gene_lookup_pchic_{}' + else: + chunk_df = chunk_df.sort_values(by=['fragment']) + chrom = chunk_df['fragment_chr'].drop_duplicates().to_list()[0] + table = 'gene_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - chunk_df['fragment'] = chunk_df['fragment'].astype(int) - sql = ''' SELECT * FROM {0} - JOIN genes on {0}.id=genes.id - WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}''' - sql = sql.format(table, chrom, + + if pchic: + inter_df_ls = chunk_df['inter_fid'].unique().tolist() + if len(inter_df_ls) > 1: + inter_df_ls = tuple(inter_df_ls) + sql = '''SELECT * FROM {} WHERE frag_id IN {}'''.format(table, inter_df_ls) + else: + inter_df_ls = (inter_df_ls[0]) + sql = '''SELECT * FROM {} WHERE frag_id = {}'''.format(table, inter_df_ls) + else: + chunk_df['fragment'] = chunk_df['fragment'].astype(int) + sql = ''' SELECT * FROM {0} + JOIN genes on {0}.id=genes.id + WHERE {0}.chrom = '{1}' AND {0}.frag_id >= {2} AND {0}.frag_id <= {3}''' + sql = sql.format(table, chrom, chunk_df['fragment'].min(), chunk_df['fragment'].max()) df = pd.DataFrame() with db.connect() as con: df = pd.read_sql_query(sql, con) if df.empty: return - df = df.loc[:, ~df.columns.duplicated()] - df = df.rename( - columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr', - 'start': 'gene_start', 'end': 'gene_end'}) - chunk_df = pd.merge(chunk_df, df, how='inner', + + if pchic: + df = df.rename( + columns={'chr': 'gene_chr', 'start': 'gene_start', 'end': 'gene_end'}) + chunk_df = pd.merge(chunk_df, df, how= 'inner', sort=False, left_on='inter_fid', + right_on='frag_id') + else: + df = df.loc[:, ~df.columns.duplicated()] + df = df.rename( + columns={'id': 'gene_id', 'name': 'gene', 'chrom': 'gene_chr', + 'start': 'gene_start', 'end': 'gene_end'}) + chunk_df = pd.merge(chunk_df, df, how='inner', left_on=['fragment_chr', 'fragment'], right_on=['gene_chr', 'frag_id']) enzyme_genes.append(chunk_df) -def fetch_hic_libs(db): +def fetch_3dgi_libs(db, pchic=False): with db.connect() as con: - hic_libs = pd.read_sql_query( - 'SELECT library, enzyme, rep_count FROM meta_hic', - con=con) - return hic_libs.drop_duplicates() - + if pchic: + _3dgi_libs = pd.read_sql_query( + 'SELECT library, enzyme, rep_count FROM meta_pchic', con=con) + else: + _3dgi_libs = pd.read_sql_query( + 'SELECT library, enzyme, rep_count FROM meta_hic', con=con) + + return _3dgi_libs.drop_duplicates() def get_gene_by_id( snp_df, inter_df, _db, logger, -): - logger.write('Identifying genes interacting with SNPs in...') + pchic=False): + if pchic: + logger.write('Identifying gene promoters interacting with SNPs in...') + else: + logger.write('Identifying genes interacting with SNPs in...') global db db = _db start_time = time.time() enzymes = inter_df['enzyme'].drop_duplicates().tolist() all_genes_df = [] #db = create_engine(db_url, echo=False, poolclass=NullPool) - hic_libs = fetch_hic_libs(db) - hic_libs = hic_libs.rename(columns={'rep_count': 'cell_line_replicates'}) + _3dgi_libs = fetch_3dgi_libs(db, pchic) + _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'}) for enzyme in enzymes: manager = multiprocessing.Manager() num_processes = int(min(16, multiprocessing.cpu_count()/2)) @@ -130,79 +181,112 @@ def get_gene_by_id( enzyme_df 
= []
         with multiprocessing.Pool(processes=num_processes) as pool:
             df = inter_df[inter_df['enzyme'] == enzyme]
-            snp_interactions = [
-                df[df['fragment_chr'] == chrom]
-                for chrom in df['fragment_chr'].drop_duplicates().to_list()
-            ]
-            desc = ' * Hi-C libraries restricted with {}'.format(enzyme)
+            if pchic:
+                df_subset = df[['p_fid', 'oe_fid', 'n_reads', 'score',
+                                'query_type', 'query_fragment', 'replicate', 'cell_line', 'enzyme']]
+                df_subset['inter_fid'] = np.where(df_subset['query_fragment'] ==
+                                                  df_subset['p_fid'], df_subset['oe_fid'], df_subset['p_fid'])
+                snp_interactions = [df_subset[df_subset['cell_line'] == celline]
+                                    for celline in df_subset['cell_line'].drop_duplicates().to_list()
+                                    ]
+                desc = ' * PCHi-C libraries restricted with {}'.format(enzyme)
+            else:
+                snp_interactions = [
+                    df[df['fragment_chr'] == chrom]
+                    for chrom in df['fragment_chr'].drop_duplicates().to_list()
+                ]
+                desc = ' * Hi-C libraries restricted with {}'.format(enzyme)
             bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}'
-            '''
-            for snp in snp_interactions:
-                find_snp_genes(snp,
-                               enzyme,
-                               enzyme_genes,
-                               db)
-            '''
             for _ in tqdm.tqdm(pool.istarmap(
                 find_snp_genes,
                 zip(snp_interactions,
                     repeat(enzyme),
-                    repeat(enzyme_genes))),
+                    repeat(enzyme_genes),
+                    repeat(pchic))),
                     total=len(snp_interactions), desc=desc, unit='batches',
                     ncols=80, bar_format=bar_format):
                 pass
-
         for df in enzyme_genes:
             enzyme_df.append(df)
         enzyme_df = pd.concat(enzyme_df)
         enzyme_df = enzyme_df.merge(
-            hic_libs, how='left',
+            _3dgi_libs, how='left',
             left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme'])
-        enzyme_df['interactions'] = enzyme_df.groupby(
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
-            'fragment'].transform('count')
-        df = enzyme_df[
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate']
-        ].drop_duplicates()
-        df['replicates'] = df.groupby(
-            ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
-            'replicate'].transform('count')
-        enzyme_df = enzyme_df.merge(
-            df, how='left',
-            on=['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate'])
-        enzyme_df = enzyme_df.drop(
-            columns=['fragment_chr', 'fragment',
+        if pchic:
+            enzyme_df = enzyme_df.drop(
+                columns=['p_fid','oe_fid','library','frag_id','cell_line_replicates'])
+        else:
+            enzyme_df['interactions'] = enzyme_df.groupby(
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
+                'fragment'].transform('count')
+            df = enzyme_df[
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate']
+            ].drop_duplicates()
+            df['replicates'] = df.groupby(
+                ['query_chr', 'query_fragment', 'gencode_id', 'cell_line'])[
+                'replicate'].transform('count')
+            enzyme_df = enzyme_df.merge(
+                df, how='left',
+                on=['query_chr', 'query_fragment', 'gencode_id', 'cell_line', 'replicate'])
+            enzyme_df = enzyme_df.drop(
+                columns=['fragment_chr', 'fragment',
                          'frag_id', 'library', 'replicate']
-        )
+            )
         enzyme_df = enzyme_df.drop_duplicates()
         all_genes_df.append(enzyme_df.drop_duplicates())
-    logger.write('  * Filtering gene interactions...')
+    if not pchic:
+        logger.write('  * Filtering gene interactions...')
     all_genes_df = pd.concat(all_genes_df)
-    all_genes_df = all_genes_df.merge(
-        snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'],
-        right_on=['fragment', 'chrom', 'enzyme'], how='inner'
-    )  # .drop_duplicates()
-    all_genes_df['sum_cell_lines'] = all_genes_df.groupby(
-        ['snp', 'gene'])['cell_line'].transform('count')
-    all_genes_df['sum_interactions'] = all_genes_df.groupby(
-        ['snp', 'gene'])[
-        
'interactions'].transform('sum') - all_genes_df['sum_replicates'] = all_genes_df.groupby( - ['snp', 'gene'])[ - 'replicates'].transform('sum') - condition = ( - (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) & - (all_genes_df['sum_replicates'] < 2) & - (all_genes_df['sum_cell_lines'] < 2)) - gene_df = all_genes_df[~condition].drop_duplicates() - cols = ['snp', 'chrom', 'locus', 'variant_id', + + if pchic: + all_genes_df = all_genes_df.drop_duplicates() + df = all_genes_df[['query_fragment','gencode_id','cell_line','n_reads', + 'score']].drop_duplicates() + df['N_reads'] = df.groupby(['query_fragment','gencode_id','cell_line'])[ + 'n_reads'].transform('sum') + df['Score'] = df.groupby(['query_fragment','gencode_id','cell_line','N_reads'])[ + 'score'].transform('mean').round(2) + df = df[['query_fragment','gencode_id','cell_line','N_reads','Score']].drop_duplicates() + all_genes_df = all_genes_df.merge( + df, on=['query_fragment','gencode_id','cell_line'], + how='left' + ).drop(['n_reads','score','inter_fid'], axis=1).drop_duplicates() + all_genes_df = all_genes_df.merge( + snp_df, left_on=['query_fragment','enzyme'], + right_on=['fragment','enzyme'], + how='inner' + ).drop_duplicates() + gene_df = all_genes_df[['snp', 'chrom', 'locus', 'variant_id', 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', - 'interactions', 'replicates', 'cell_line', 'cell_line_replicates', - 'sum_interactions', 'sum_replicates', 'sum_cell_lines'] + 'N_reads', 'Score', 'cell_line']].drop_duplicates() + else: + all_genes_df = all_genes_df.merge( + snp_df, left_on=['query_fragment', 'query_chr', 'enzyme'], + right_on=['fragment', 'chrom', 'enzyme'], how='inner' + ) # .drop_duplicates() + all_genes_df['sum_cell_lines'] = all_genes_df.groupby( + ['snp', 'gene'])['cell_line'].transform('count') + all_genes_df['sum_interactions'] = all_genes_df.groupby( + ['snp', 'gene'])[ + 'interactions'].transform('sum') + all_genes_df['sum_replicates'] = all_genes_df.groupby( + ['snp', 'gene'])[ + 'replicates'].transform('sum') + condition = ( + (all_genes_df['interactions'] / all_genes_df['cell_line_replicates'] <= 1) & + (all_genes_df['sum_replicates'] < 2) & + (all_genes_df['sum_cell_lines'] < 2)) + gene_df = all_genes_df[~condition].drop_duplicates() + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'interactions', 'replicates', 'cell_line', 'cell_line_replicates', + 'sum_interactions', 'sum_replicates', 'sum_cell_lines'] logger.write(' Time elasped: {:.2f} mins.'.format( (time.time() - start_time)/60)) - return gene_df[cols].drop_duplicates() - + if pchic: + return gene_df + else: + return gene_df[cols].drop_duplicates() def get_gene_by_position(df, db): gene_df = [] @@ -288,6 +372,7 @@ def get_gene_info( output_dir, db, logger, + pchic = False, suppress_intermediate_files=False): enzymes = hic_df['enzyme'].drop_duplicates().tolist() gene_df = [] @@ -340,7 +425,10 @@ def get_gene_info( len(omitted_genes)) msg = msg + ':\n\t{}'.format(', '.join(omitted_genes)) logger.write(msg) - - gene_df = get_gene_fragments(gene_df, enzymes, db) + gene_df = get_gene_fragments(gene_df, enzymes, db, pchic) gene_df = gene_df.rename(columns={'frag_id': 'fragment'}) + if pchic: + gene_df = gene_df.drop(['chr','gene'], axis=1) + else: + pass return(gene_df) diff --git a/codes3d/interactions.py b/codes3d/interactions.py index 4fb486d..ba02b30 100755 --- a/codes3d/interactions.py +++ b/codes3d/interactions.py @@ -17,47 +17,103 @@ def 
get_cell_interactions( replicate, query_type, df, - interactions): + interactions, + pchic=False): cell_line = os.path.dirname(replicate).split('/')[-1] rep_interactions = [] # TODO: Add implementation for PostgreSQL database - sql = ''' - SELECT chr2, fragment2 FROM interactions WHERE chr1='{}' AND fragment1={} - ''' - with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: - for idx, row in df.iterrows(): - from_db = pd.read_sql_query(sql.format( - row['chrom'][3:], row['fragment']), con=con) - if not from_db.empty: - from_db = from_db.rename( - columns={'chr2': 'fragment_chr', 'fragment2': 'fragment'}) - #from_db[query_type] = row[query_type] - from_db['fragment_chr'] = 'chr' + \ - from_db['fragment_chr'].astype(str) - from_db['query_type'] = query_type - from_db['query_chr'] = row['chrom'] - from_db['query_fragment'] = row['fragment'] - from_db['fragment'] = from_db['fragment'].astype('int64') - rep_interactions.append(from_db) - if len(rep_interactions) == 0: - return - rep_interactions = pd.concat(rep_interactions) - # A fragment can have many interactions with the same fragment - rep_interactions = rep_interactions.drop_duplicates() # Keep unique replicates - rep = os.path.basename(replicate).split('.')[0] - rep = rep[:rep.rfind('_merged')].split('_')[-1] - rep_interactions['replicate'] = rep - rep_interactions['cell_line'] = cell_line - rep_interactions['enzyme'] = enzyme - self_ligation = ( # Filter self-ligating fragments - (rep_interactions['fragment_chr'] == rep_interactions['query_chr']) &\ - (rep_interactions['fragment'] == rep_interactions['query_fragment'])) - rep_interactions = rep_interactions[~self_ligation] + if pchic: + sql = '''SELECT * from interactions WHERE p_fid={} OR oe_fid={}''' + with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: + for idx, row in df.iterrows(): + from_db = pd.read_sql_query(sql.format( + row['fragment'], row['fragment']), con=con) + if not from_db.empty: + from_db_subset = from_db[['p_fid','p_name','oe_fid','oe_name', + 'n_reads','score']] + from_db_subset['query_type'] = query_type + from_db_subset['query_fragment'] = row['fragment'] + rep_interactions.append(from_db_subset) + if len(rep_interactions) == 0: + return + rep_interactions = pd.concat(rep_interactions) + rep_interactions = rep_interactions.drop_duplicates() + rep = os.path.basename(replicate).split('.')[0] + rep_interactions['replicate'] = rep + rep_interactions['cell_line'] = cell_line + rep_interactions['enzyme'] = enzyme + interactions.append(rep_interactions) - interactions.append(rep_interactions) + else: + sql = ''' + SELECT chr2, fragment2 FROM interactions WHERE chr1='{}' AND fragment1={} + ''' + with create_engine('sqlite:///{}'.format(replicate), echo=False).connect() as con: + for idx, row in df.iterrows(): + from_db = pd.read_sql_query(sql.format( + row['chrom'][3:], row['fragment']), con=con) + if not from_db.empty: + from_db = from_db.rename( + columns={'chr2': 'fragment_chr', 'fragment2': 'fragment'}) + #from_db[query_type] = row[query_type] + from_db['fragment_chr'] = 'chr' + \ + from_db['fragment_chr'].astype(str) + from_db['query_type'] = query_type + from_db['query_chr'] = row['chrom'] + from_db['query_fragment'] = row['fragment'] + from_db['fragment'] = from_db['fragment'].astype('int64') + rep_interactions.append(from_db) + if len(rep_interactions) == 0: + return + rep_interactions = pd.concat(rep_interactions) + # A fragment can have many interactions with the same fragment + rep_interactions = 
rep_interactions.drop_duplicates() # Keep unique replicates + rep = os.path.basename(replicate).split('.')[0] + rep = rep[:rep.rfind('_merged')].split('_')[-1] + rep_interactions['replicate'] = rep + rep_interactions['cell_line'] = cell_line + rep_interactions['enzyme'] = enzyme + self_ligation = ( # Filter self-ligating fragments + (rep_interactions['fragment_chr'] == rep_interactions['query_chr']) &\ + (rep_interactions['fragment'] == rep_interactions['query_fragment'])) + rep_interactions = rep_interactions[~self_ligation] + interactions.append(rep_interactions) +def filter_inter_for_pchic(interactions, query_type): + pchic_interactions = [] + if query_type == 'snp': + for row in interactions.itertuples(index=False): + # remove snps that interact with unannotated oe_frag + if row.query_fragment == row.p_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name == ".": + pchic_interactions.append(row) + elif row.query_fragment == row.p_fid and \ + row.oe_name == ".": + pass + elif query_type == 'gencode_id': + for row in interactions.itertuples(index=False): + if (row.query_fragment == row.p_fid) and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name != ".": + pchic_interactions.append(row) + elif row.query_fragment == row.p_fid and \ + row.oe_name == ".": + pchic_interactions.append(row) + elif row.query_fragment == row.oe_fid and \ + row.oe_name == ".": + pass + pchic_interactions = pd.DataFrame(pchic_interactions) + return pchic_interactions def find_interactions( fragment_df, @@ -66,16 +122,19 @@ def find_interactions( hic_df, num_processes, logger, + pchic=False, suppress_intermediate_files=False): """Finds fragments that interact with gene fragments. 
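With Hi-C libraries every query fragment is looked up in the
     interactions table of each replicate database. With PCHi-C
     (pchic=True) the query fragment is matched against both the bait
     (p_fid) and other-end (oe_fid) columns, and the result is then
     filtered by filter_inter_for_pchic(), which drops bait-side SNP
     hits whose other end is unannotated (oe_name == ".").
     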
""" start_time = time.time() logger.write("Finding chromatin interactions in...") - # print(fragment_df) restriction_enzymes = fragment_df['enzyme'].drop_duplicates().tolist() interactions_df = [] for enzyme in restriction_enzymes: - hic_data_dir = os.path.join(lib_dir, 'hic', enzyme, 'hic_data') + if pchic: + _3dgi_data_dir = os.path.join(lib_dir, 'pchic', enzyme, 'pchic_data') + else: + _3dgi_data_dir = os.path.join(lib_dir, 'hic', enzyme, 'hic_data') manager = multiprocessing.Manager() gene_interactions = manager.list() enzyme_qc_df = manager.list() @@ -84,16 +143,19 @@ def find_interactions( cell_lines = hic_df[hic_df['enzyme'] == enzyme]['library'].tolist() replicates = [] for cell_line in cell_lines: # TODO: Replace with database - replicates += [os.path.join(hic_data_dir, cell_line, replicate) + replicates += [os.path.join(_3dgi_data_dir, cell_line, replicate) for replicate in os.listdir( - os.path.join(hic_data_dir, cell_line)) + os.path.join(_3dgi_data_dir, cell_line)) if replicate.endswith('.db')] enzyme_df = fragment_df[fragment_df['enzyme'] == enzyme] enzyme_df = enzyme_df[['fragment', 'chrom', 'enzyme']].drop_duplicates() with multiprocessing.Pool(num_processes) as pool: - desc = ' * Hi-C libraries restricted with {}'.format(enzyme) + if pchic: + desc = ' * PCHi-C libraries restricted with {}'.format(enzyme) + else: + desc = ' * Hi-C libraries restricted with {}'.format(enzyme) bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' for _ in tqdm.tqdm( @@ -103,7 +165,8 @@ def find_interactions( replicates, repeat(query_type), repeat(enzyme_df), - repeat(gene_interactions) + repeat(gene_interactions), + repeat(pchic) ) ), total=len(replicates), @@ -114,12 +177,26 @@ def find_interactions( pass for df in gene_interactions: interactions_df.append(df) - interactions_df = pd.concat(interactions_df) - cols = ['cell_line', 'query_type', 'query_chr', 'query_fragment', - 'fragment_chr', 'fragment', # 'fragment_distance', - 'replicate', 'enzyme'] - interactions_df = interactions_df[cols] - interactions_df = interactions_df.drop_duplicates() - logger.write(' Time elasped: {:.2f} mins.'.format( - (time.time() - start_time)/60)) + if not interactions_df: + msg = '''Program exiting: Chromatin interactions involving input + SNP(s)/gene(s) could not be found.''' + sys.exit(msg) + logger.write(msg) + else: + interactions_df = pd.concat(interactions_df) + if pchic: + cols = ['p_fid', 'p_name', 'oe_fid', 'oe_name', 'n_reads', + 'score', 'query_type', 'query_fragment', + 'replicate', 'cell_line', 'enzyme'] + interactions_df = interactions_df[cols] + interactions_df = interactions_df.drop_duplicates() + interactions_df = filter_inter_for_pchic(interactions_df, query_type) + logger.write(' Time elasped: {:.2f} mins.'.format((time.time() - start_time)/60)) + else: + cols = ['cell_line', 'query_type', 'query_chr', 'query_fragment', + 'fragment_chr', 'fragment', 'replicate', 'enzyme'] + interactions_df = interactions_df[cols] + interactions_df = interactions_df.drop_duplicates() + logger.write(' Time elasped: {:.2f} mins.'.format((time.time() - start_time)/60)) + return interactions_df diff --git a/codes3d/snps.py b/codes3d/snps.py index 4772e6c..edc638a 100755 --- a/codes3d/snps.py +++ b/codes3d/snps.py @@ -29,6 +29,7 @@ def find_snps( num_processes, _db, logger, + pchic=False, suppress_intermediate_files=False ): start_time = time.time() @@ -37,17 +38,27 @@ def find_snps( global db db = _db enzymes = 
inter_df['enzyme'].drop_duplicates().tolist() - hic_libs = genes.fetch_hic_libs(db) - hic_libs = hic_libs.rename(columns={'rep_count': 'cell_line_replicates'}) + _3dgi_libs = genes.fetch_3dgi_libs(db, pchic) + _3dgi_libs = _3dgi_libs.rename(columns={'rep_count': 'cell_line_replicates'}) inter_df = inter_df.merge( - hic_libs, how='left', - left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme']) - default_chrom = ['chr' + str(i) - for i in list(range(1, 23))] + ['X', 'Y', 'M'] - chrom_list = inter_df['fragment_chr'].drop_duplicates().tolist() - chrom_list = [i for i in default_chrom if i in chrom_list] - inter_df = inter_df[inter_df['fragment_chr'].isin(default_chrom)] - inter_df = inter_df.astype({'fragment': int}) + _3dgi_libs, how='left', + left_on=['cell_line', 'enzyme'], right_on=['library', 'enzyme']) + if pchic: + inter_df = inter_df[['p_fid','oe_fid','n_reads','score', + 'query_type', 'query_fragment', 'replicate', 'cell_line', 'enzyme', + 'library']] + inter_df['inter_frag'] = np.where(inter_df['query_fragment'] == + inter_df['p_fid'], inter_df['oe_fid'], inter_df['p_fid']) + inter_df = inter_df[['n_reads','score','query_type','query_fragment','inter_frag', + 'replicate','cell_line','enzyme']].drop_duplicates() + else: + default_chrom = ['chr' + str(i) + for i in list(range(1, 23))] + ['X', 'Y', 'M'] + chrom_list = inter_df['fragment_chr'].drop_duplicates().tolist() + chrom_list = [i for i in default_chrom if i in chrom_list] + inter_df = inter_df[inter_df['fragment_chr'].isin(default_chrom)] + inter_df = inter_df.astype({'fragment': int}) + gene_info_df = gene_info_df.rename( columns={ 'name': 'gene', @@ -59,113 +70,208 @@ def find_snps( all_snps = [] all_genes = [] all_eqtls = [] - logger.write('Finding SNPs within fragments interacting with genes in...') - for chrom in sorted(chrom_list): - chrom_dir = os.path.join(output_dir, chrom) - #if os.path.exists(os.path.join(chrom_dir, 'eqtls.txt')): - # logger.write(' Warning: {} already exists. 
Skipping.'.format( - # os.path.join(chrom_dir, 'eqtls.txt'))) - # continue - logger.write(' Chromosome {}'.format(chrom)) - snp_cols = ['snp', 'variant_id', 'chr', - 'locus', 'id', 'fragment', 'enzyme'] - chrom_df = inter_df[inter_df['fragment_chr'] == chrom] - chrom_df = chrom_df.astype({'fragment': int, - 'fragment_chr': object}) - enzymes = chrom_df['enzyme'].drop_duplicates().tolist() + + if pchic: + logger.write('Finding SNPs within fragments interacting with gene promoters in...') + else: + logger.write('Finding SNPs within fragments interacting with genes in...') + + if pchic: snp_df = [] for enzyme in enzymes: - enzyme_df = chrom_df[chrom_df['enzyme'] == enzyme] + enzyme_dir = os.path.join(output_dir, enzyme) + enzyme_df = inter_df[inter_df['enzyme'] == enzyme] enzyme_df = enzyme_df.merge( - gene_info_df, how='inner', - left_on=['query_chr', 'query_fragment', 'enzyme'], - right_on=['chrom', 'gene_fragment', 'enzyme']) + gene_info_df, how='inner', + left_on = ['query_fragment', 'enzyme'], + right_on = ['gene_fragment', 'enzyme']) fragment_df = enzyme_df[ - ['gencode_id', 'fragment_chr', 'fragment']].drop_duplicates() - enzyme_df = enzyme_df.sort_values(by=['fragment']) + ['gencode_id','query_fragment','inter_frag','project']].drop_duplicates() + enzyme_df = enzyme_df.drop(columns=['project']).drop_duplicates() + enzyme_df = enzyme_df.sort_values(by=['inter_frag']) chunksize = 20000 enzyme_chunks = [enzyme_df[i:i+chunksize] - for i in range(0, enzyme_df.shape[0], chunksize)] + for i in range(0, enzyme_df.shape[0], chunksize)] manager = multiprocessing.Manager() snps = manager.list() - desc = ' * Hi-C libraries restricted with {}'.format( - enzyme) + desc = ' * PCHi-C libraries restricted with {}'.format(enzyme) bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' - ''' - for df in tqdm.tqdm(enzyme_chunks, desc=desc, unit='batches', - ncols=80, bar_format=bar_format): - find_gene_snps( - df, - enzyme, - snps) - ''' with multiprocessing.Pool(processes=4) as pool: for _ in tqdm.tqdm( pool.istarmap( find_gene_snps, zip(enzyme_chunks, repeat(enzyme), - repeat(snps)) - ), + repeat(snps), + repeat(pchic)) + ), total=len(enzyme_chunks), desc=desc, unit='batches', ncols=80, bar_format=bar_format): pass + for df in snps: + df['enzyme'] = enzyme + snp_df.append(df) + if len(snp_df) == 0: + continue + snp_df = pd.concat(snp_df) + logger.verbose = False + gene_df, snp_df = filter_snp_fragments( + snp_df, logger, pchic) + snp_df.sort_values(by=['variant_id'], inplace=True) + snp_list = snp_df['variant_id'].drop_duplicates().tolist() + batchsize = 2000 + snp_batches = [snp_list[i:i + batchsize] + for i in range(0, len(snp_list), batchsize)] + chrom_eqtl_df = [] + for batch_num, snp_batch in enumerate(snp_batches): + if len(snp_batches) > 1: + logger.verbose = True + logger.write(' Mapping eQTLs batch {} of {}'.format( + batch_num+1, len(snp_batches))) + logger.verbose = False + batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] + eqtl_df = eqtls.map_eqtls( + batch_gene_df, + tissues, + output_dir, + C, + genotypes_fp, + num_processes, + eqtl_project_db, + covariates_dir, + expression_dir, + pval_threshold, + maf_threshold, + fdr_threshold, + logger) + if eqtl_df is None: + continue + chrom_eqtl_df.append(eqtl_df) + if len(chrom_eqtl_df) > 0: + chrom_eqtl_df = pd.concat(chrom_eqtl_df) + else: + chrom_eqtl_df = pd.DataFrame() + if not suppress_intermediate_files: + os.makedirs(enzyme_dir, exist_ok=True) + snp_df.to_csv(os.path.join(enzyme_dir, 'snps.txt'), + 
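# note: PCHi-C intermediates are written per restriction enzyme
+                          # (enzyme_dir), not per chromosome as in the Hi-C branch
+                          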
sep='\t', index=False) + gene_df.to_csv(os.path.join(enzyme_dir, 'genes.txt'), + sep='\t', index=False) + chrom_eqtl_df.to_csv(os.path.join(enzyme_dir, 'eqtls.txt'), + sep='\t', index=False) + all_eqtls.append(chrom_eqtl_df) + all_snps.append(snp_df) + all_genes.append(gene_df) + logger.verbose = True - for df in snps: - df['enzyme'] = enzyme - snp_df.append(df) - if len(snp_df) == 0: - continue - snp_df = pd.concat(snp_df) - logger.verbose = False - gene_df, snp_df = filter_snp_fragments( - snp_df, logger) - snp_df.sort_values(by=['variant_id'], inplace=True) - snp_list = snp_df['variant_id'].drop_duplicates().tolist() - batchsize = 2000 - snp_batches = [snp_list[i:i + batchsize] - for i in range(0, len(snp_list), batchsize)] - chrom_eqtl_df = [] - for batch_num, snp_batch in enumerate(snp_batches): - if len(snp_batches) > 1: - logger.verbose = True - logger.write(' Mapping eQTLs batch {} of {}'.format( - batch_num+1, len(snp_batches))) - logger.verbose = False - batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] - eqtl_df = eqtls.map_eqtls( - batch_gene_df, - tissues, - output_dir, - C, - genotypes_fp, - num_processes, - eqtl_project_db, - covariates_dir, - expression_dir, - pval_threshold, - maf_threshold, - fdr_threshold, - logger) - if eqtl_df is None: + else: + for chrom in sorted(chrom_list): + chrom_dir = os.path.join(output_dir, chrom) + #if os.path.exists(os.path.join(chrom_dir, 'eqtls.txt')): + # logger.write(' Warning: {} already exists. Skipping.'.format( + # os.path.join(chrom_dir, 'eqtls.txt'))) + # continue + logger.write(' Chromosome {}'.format(chrom)) + snp_cols = ['snp', 'variant_id', 'chr', + 'locus', 'id', 'fragment', 'enzyme'] + chrom_df = inter_df[inter_df['fragment_chr'] == chrom] + chrom_df = chrom_df.astype({'fragment': int, + 'fragment_chr': object}) + enzymes = chrom_df['enzyme'].drop_duplicates().tolist() + snp_df = [] + for enzyme in enzymes: + enzyme_df = chrom_df[chrom_df['enzyme'] == enzyme] + enzyme_df = enzyme_df.merge( + gene_info_df, how='inner', + left_on=['query_chr', 'query_fragment', 'enzyme'], + right_on=['chrom', 'gene_fragment', 'enzyme']) + fragment_df = enzyme_df[ + ['gencode_id', 'fragment_chr', 'fragment']].drop_duplicates() + enzyme_df = enzyme_df.sort_values(by=['fragment']) + chunksize = 20000 + enzyme_chunks = [enzyme_df[i:i+chunksize] + for i in range(0, enzyme_df.shape[0], chunksize)] + manager = multiprocessing.Manager() + snps = manager.list() + desc = ' * Hi-C libraries restricted with {}'.format( + enzyme) + bar_format = '{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt} {unit}' + ''' + for df in tqdm.tqdm(enzyme_chunks, desc=desc, unit='batches', + ncols=80, bar_format=bar_format): + find_gene_snps( + df, + enzyme, + snps) + ''' + with multiprocessing.Pool(processes=4) as pool: + for _ in tqdm.tqdm( + pool.istarmap( + find_gene_snps, + zip(enzyme_chunks, + repeat(enzyme), + repeat(snps), + repeat(pchic)) + ), + total=len(enzyme_chunks), desc=desc, unit='batches', + ncols=80, bar_format=bar_format): + pass + + for df in snps: + df['enzyme'] = enzyme + snp_df.append(df) + if len(snp_df) == 0: continue - chrom_eqtl_df.append(eqtl_df) - if len(chrom_eqtl_df) > 0: - chrom_eqtl_df = pd.concat(chrom_eqtl_df) - else: - chrom_eqtl_df = pd.DataFrame() - if not suppress_intermediate_files: - os.makedirs(chrom_dir, exist_ok=True) - snp_df.to_csv(os.path.join(chrom_dir, 'snps.txt'), - sep='\t', index=False) - gene_df.to_csv(os.path.join(chrom_dir, 'genes.txt'), - sep='\t', index=False) - 
chrom_eqtl_df.to_csv(os.path.join(chrom_dir, 'eqtls.txt'), - sep='\t', index=False) - all_eqtls.append(chrom_eqtl_df) - all_snps.append(snp_df) - all_genes.append(gene_df) - logger.verbose = True + snp_df = pd.concat(snp_df) + logger.verbose = False + gene_df, snp_df = filter_snp_fragments( + snp_df, logger, pchic) + snp_df.sort_values(by=['variant_id'], inplace=True) + snp_list = snp_df['variant_id'].drop_duplicates().tolist() + batchsize = 2000 + snp_batches = [snp_list[i:i + batchsize] + for i in range(0, len(snp_list), batchsize)] + chrom_eqtl_df = [] + for batch_num, snp_batch in enumerate(snp_batches): + if len(snp_batches) > 1: + logger.verbose = True + logger.write(' Mapping eQTLs batch {} of {}'.format( + batch_num+1, len(snp_batches))) + logger.verbose = False + batch_gene_df = gene_df[gene_df['variant_id'].isin(snp_batch)] + eqtl_df = eqtls.map_eqtls( + batch_gene_df, + tissues, + output_dir, + C, + genotypes_fp, + num_processes, + eqtl_project_db, + covariates_dir, + expression_dir, + pval_threshold, + maf_threshold, + fdr_threshold, + logger) + if eqtl_df is None: + continue + chrom_eqtl_df.append(eqtl_df) + if len(chrom_eqtl_df) > 0: + chrom_eqtl_df = pd.concat(chrom_eqtl_df) + else: + chrom_eqtl_df = pd.DataFrame() + if not suppress_intermediate_files: + os.makedirs(chrom_dir, exist_ok=True) + snp_df.to_csv(os.path.join(chrom_dir, 'snps.txt'), + sep='\t', index=False) + gene_df.to_csv(os.path.join(chrom_dir, 'genes.txt'), + sep='\t', index=False) + chrom_eqtl_df.to_csv(os.path.join(chrom_dir, 'eqtls.txt'), + sep='\t', index=False) + all_eqtls.append(chrom_eqtl_df) + all_snps.append(snp_df) + all_genes.append(gene_df) + logger.verbose = True if len(all_eqtls) == 0: snp_df = pd.DataFrame() gene_df = pd.DataFrame() @@ -179,50 +285,87 @@ def find_snps( (time.time() - start_time)/60)) return snp_df, gene_df, eqtl_df - def find_gene_snps( inter_df, enzyme, - snps#, - #db, -): + snps, + pchic=False): db.dispose() eqtl_project_db.dispose() - table = 'variant_lookup_{}' + + if pchic: + table = 'variant_lookup_pchic_{}' + else: + table = 'variant_lookup_{}' + if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') else: table = table.format(enzyme.lower()) - chrom = inter_df['fragment_chr'].drop_duplicates().tolist()[0] - sql = '''SELECT * FROM {} WHERE chrom = '{}' AND frag_id >= {} AND frag_id <= {}''' - df = pd.DataFrame() - con = eqtl_project_db.connect() - res = con.execute( - sql.format( - table, chrom, inter_df['fragment'].min(), inter_df['fragment'].max()) - ).fetchall() - con.close() - if res: - df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) - inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) - df = inter_df.merge( - df, how='inner', left_on=['fragment'], right_on=['frag_id']) - df['id'] = df['id'].astype('Int64') - df = df[df['frag_id'].notnull()] - df = df.drop_duplicates() - snp_df = find_snp_by_id(df, eqtl_project_db) - if snp_df.empty: - return - snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) - snp_df['enzyme'] = enzyme - snp_df = snp_df.rename(columns={'rsid': 'snp'}) - snps.append(snp_df.drop_duplicates()) - -def find_snp_by_id(df, db): + if not pchic: + chrom = inter_df['fragment_chr'].drop_duplicates().tolist()[0] + sql = '''SELECT * FROM {} WHERE chrom = '{}' AND frag_id >= {} AND frag_id <= {}''' + df = pd.DataFrame() + con = eqtl_project_db.connect() + res = con.execute( + sql.format( + table, chrom, inter_df['fragment'].min(), inter_df['fragment'].max()) + ).fetchall() + 
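# the Hi-C branch fetches candidate variants by fragment-id range
+        # within a single chromosome; the PCHi-C branch below matches the
+        # exact fragment ids with an IN (...) or = clause instead
+        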
con.close() + if res: + df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) + inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) + df = inter_df.merge( + df, how='inner', left_on=['fragment'], right_on=['frag_id']) + df['id'] = df['id'].astype('Int64') + df = df[df['frag_id'].notnull()] + df = df.drop_duplicates() + snp_df = find_snp_by_id(df, eqtl_project_db, pchic) + if snp_df.empty: + return + snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) + snp_df['enzyme'] = enzyme + snp_df = snp_df.rename(columns={'rsid': 'snp'}) + snps.append(snp_df.drop_duplicates()) + else: + snp_frag = inter_df['inter_frag'].drop_duplicates().tolist() + if len(snp_frag) > 1: + snp_frag = tuple(snp_frag) + sql = '''SELECT * FROM {} WHERE frag_id IN {}''' + else: + snp_frag = (snp_frag[0]) + sql = '''SELECT * FROM {} WHERE frag_id = {}''' + df = pd.DataFrame() + con = eqtl_project_db.connect() + res = con.execute( + sql.format( + table, snp_frag) + ).fetchall() + con.close() + if res: + df = pd.DataFrame(res, columns=['frag_id', 'chrom', 'id']) + inter_df = inter_df.rename(columns={'chrom': 'gene_chr'}) + df = inter_df.merge( + df, how='inner', left_on=['inter_frag'], right_on=['frag_id']) + df['id'] = df['id'].astype('Int64') + df = df[df['frag_id'].notnull()] + df = df.drop_duplicates() + snp_df = find_snp_by_id(df, eqtl_project_db, pchic) + if snp_df.empty: + return + snp_df = snp_df.merge(df, how='inner', on=['id', 'chrom']) + snp_df['enzyme'] = enzyme + snp_df = snp_df.rename(columns={'rsid': 'snp'}) + snps.append(snp_df.drop_duplicates()) + +def find_snp_by_id(df, db, pchic=False): df = df.sort_values(by=['id']) - chunksize = eqtls.calc_chunksize( - df['id'].tolist(), 2000000) + if not pchic: + chunksize = eqtls.calc_chunksize( + df['id'].tolist(), 2000000) + else: + chunksize = 100000 chunks = [df[i:i+chunksize] for i in range(0, len(df), chunksize)] snp_df = [] sql = '''SELECT rsid, variant_id, chrom, locus, id FROM variants WHERE id >= {} @@ -258,42 +401,67 @@ def find_snp_by_variant_id(df, db): def filter_snp_fragments( snp_df, - logger): + logger, + pchic=False): ''' Filter snp-fragment interactions ''' - logger.write(' * Filtering gene-SNP interactions...') - snp_df['interactions'] = snp_df.groupby( - ['variant_id', 'gene', 'cell_line', 'enzyme'])[ - 'gene_fragment'].transform('count') - snp_df['replicates'] = snp_df.groupby( - ['variant_id', 'gene', 'cell_line', 'enzyme'])[ - 'replicate'].transform('count') - snp_df = snp_df.drop(columns=['replicate', 'gene_fragment']) - snp_df = snp_df.drop_duplicates() - snp_df['sum_interactions'] = snp_df.groupby( - ['variant_id', 'gene'])[ - 'interactions'].transform('sum') - snp_df['sum_replicates'] = snp_df.groupby( - ['variant_id', 'gene'])[ - 'replicates'].transform('sum') - snp_df['sum_cell_lines'] = snp_df.groupby( - ['variant_id', 'gene'])['cell_line'].transform('count') - condition = ( - (snp_df['interactions'] / snp_df['cell_line_replicates'] <= 1) & - (snp_df['sum_replicates'] < 2) & - (snp_df['sum_cell_lines'] < 2)) - gene_df = snp_df[~condition] - snp_df = gene_df[ - ['id', 'snp', 'chrom', 'locus', 'variant_id', 'fragment', 'enzyme'] - ] - snp_df.drop_duplicates(inplace=True) - cols = ['snp', 'chrom', 'locus', 'variant_id', + if pchic: + ''' Calculating N_reads and chicago_scores ''' + logger.write(' * Calculating cumulative scores for gene promoter-SNP interactions...') + snp_df = snp_df.drop_duplicates() + snp_df = snp_df.drop(columns=['replicate','id']).drop_duplicates() + snp_df['N_reads'] = snp_df.groupby( + ['variant_id', 
'gencode_id', 'inter_frag', + 'gene_fragment', 'cell_line'])[ + 'n_reads'].transform('sum') + snp_df['Score'] = snp_df.groupby(['variant_id', 'gencode_id', 'inter_frag', + 'gene_fragment', 'cell_line','N_reads'])[ + 'score'].transform('mean').round(2) + gene_df = snp_df[['snp', 'chrom', 'locus', 'variant_id', 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', - 'interactions', 'replicates', 'enzyme', - 'cell_line', 'cell_line_replicates', 'sum_interactions', - 'sum_replicates', 'sum_cell_lines'] + 'enzyme', 'cell_line', 'N_reads', 'Score']].drop_duplicates() + snp_df = snp_df[ + ['snp', 'chrom', 'locus', 'variant_id','frag_id','enzyme']] + snp_df.drop_duplicates(inplace=True) + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'enzyme', 'cell_line', 'N_reads', 'Score'] + else: + logger.write(' * Filtering gene-SNP interactions...') + snp_df['interactions'] = snp_df.groupby( + ['variant_id', 'gene', 'cell_line', 'enzyme'])[ + 'gene_fragment'].transform('count') + snp_df['replicates'] = snp_df.groupby( + ['variant_id', 'gene', 'cell_line', 'enzyme'])[ + 'replicate'].transform('count') + snp_df = snp_df.drop(columns=['replicate', 'gene_fragment']) + snp_df = snp_df.drop_duplicates() + snp_df['sum_interactions'] = snp_df.groupby( + ['variant_id', 'gene'])[ + 'interactions'].transform('sum') + snp_df['sum_replicates'] = snp_df.groupby( + ['variant_id', 'gene'])[ + 'replicates'].transform('sum') + snp_df['sum_cell_lines'] = snp_df.groupby( + ['variant_id', 'gene'])['cell_line'].transform('count') + condition = ( + (snp_df['interactions'] / snp_df['cell_line_replicates'] <= 1) & + (snp_df['sum_replicates'] < 2) & + (snp_df['sum_cell_lines'] < 2)) + gene_df = snp_df[~condition] + snp_df = gene_df[ + ['id', 'snp', 'chrom', 'locus', 'variant_id', 'fragment', 'enzyme'] + ] + snp_df.drop_duplicates(inplace=True) + cols = ['snp', 'chrom', 'locus', 'variant_id', + 'gene', 'gencode_id', 'gene_chr', 'gene_start', 'gene_end', + 'interactions', 'replicates', 'enzyme', + 'cell_line', 'cell_line_replicates', 'sum_interactions', + 'sum_replicates', 'sum_cell_lines'] return gene_df[cols], snp_df + + def process_rs_df_whole(rs_df, db): rsid_tuple = () if len(rs_df) == 1: @@ -371,14 +539,17 @@ def get_snp_fragments_whole(snp_df, restriction_enzymes, db): return snp_df -def get_snp_fragments(snp_df, restriction_enzymes, db): +def get_snp_fragments(snp_df, restriction_enzymes, db, pchic=False): snp_df = snp_df.sort_values(by=['id']) fragment_df = [] chunksize = 1000 chunks = [snp_df[i:i+chunksize] for i in range(0, snp_df.shape[0], chunksize)] for enzyme in restriction_enzymes: - table = 'variant_lookup_{}' + if pchic: + table = 'variant_lookup_pchic_{}' + else: + table = 'variant_lookup_{}' df = [] if enzyme in ['MboI', 'DpnII']: # MboI and DpnII have the same restriction sites table = table.format('mboi') @@ -563,12 +734,13 @@ def get_snp(inputs, db, rs_merge_arch_fp, logger, + pchic=False, suppress_intermediate_files=False ): """Retrieve SNP position and restriction fragments. Args: inputs: File(s) (or stdin) containing SNP rsIDs or genomic positions in bed format (chr:start-end)> - restriction_enzymes: a list of restriction enzymes with which query Hi-C libraries are prepared + restriction_enzymes: a list of restriction enzymes with which query Hi-C/PCHi-C libraries are prepared output_dir: User-specified directory for results. Defaults to inputs directory. 
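pchic: if True, SNP fragments are taken from the variant_lookup_pchic_*
            tables instead of the Hi-C variant_lookup_* tables
        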
postgres_url: path to codes3d_common database suppress_intermediate_files: if 'False', snps.txt file is written to output_dir @@ -594,7 +766,7 @@ def get_snp(inputs, logger.write('We could not find your SNPs in our databases.') sys.exit() snp_df[['id', 'locus']] = snp_df[['id', 'locus']].astype(int) - snp_df = get_snp_fragments(snp_df, restriction_enzymes, db) + snp_df = get_snp_fragments(snp_df, restriction_enzymes, db, pchic) snp_df = snp_df.rename(columns={'rsid': 'snp', 'frag_id': 'fragment'}) if not suppress_intermediate_files: if not merged_snps.empty: diff --git a/codes3d/summary.py b/codes3d/summary.py index 74eeda6..03b8d98 100755 --- a/codes3d/summary.py +++ b/codes3d/summary.py @@ -17,7 +17,7 @@ def produce_summary( eqtl_df, snp_df, gene_df, expression_table_fp, fdr_threshold, output_dir, num_processes, - output_format, no_afc, logger): + output_format, no_afc, logger, pchic=False): """Write final results of eQTL-eGene associations Args: @@ -43,46 +43,75 @@ def produce_summary( eqtl_df['sid_chr'] = eqtl_df['sid_chr'].str[3:] gene_df['pid'] = gene_df['gencode_id'].str.split('.', expand=True)[0] cols = [] - - logger.write(" * Computing Hi-C data...") - df = gene_df[['snp', 'gencode_id', 'cell_line', 'interactions', - 'replicates', 'cell_line_replicates']].merge( - eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') - df['hic_score'] = df['interactions'] / df['cell_line_replicates'] - df = df.drop_duplicates() - grouped_df = df.groupby(['snp', 'gencode_id']) - df['cell_line_hic_scores'] = df.apply(lambda row: ', '.join( - grouped_df.get_group((row['snp'], row['gencode_id']))[ - 'hic_score'].round(2).astype(str).values.tolist()), axis=1) - df['hic_score'] = df.apply(lambda row: grouped_df.get_group( - (row['snp'], row['gencode_id']))[ - 'hic_score'].sum().round(2), axis=1) - df['cell_lines'] = df.apply(lambda row: ', '.join( - grouped_df.get_group((row['snp'], row['gencode_id']))[ - 'cell_line'].values.tolist()), axis=1) - eqtl_df = eqtl_df.merge(df[ - ['snp', 'gencode_id', 'cell_lines', 'hic_score', 'cell_line_hic_scores'] - ].drop_duplicates(), on=['snp', 'gencode_id'], how='left') - + if pchic: + df = gene_df[['snp', 'gencode_id', 'cell_line','N_reads','Score']].merge( + eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') + df = df.drop_duplicates() + grouped_df = df.groupby(['snp', 'gencode_id']) + df['cell_lines'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'cell_line'].values.tolist()), axis=1) + df['chicago_scores'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'Score'].astype(str).values.tolist()), axis=1) + df['N_reads_per_cell_line'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 'N_reads'].astype(str).values.tolist()), axis=1) + eqtl_df = eqtl_df.merge(df[ + ['snp', 'gencode_id', 'cell_lines', 'N_reads_per_cell_line', 'chicago_scores'] + ].drop_duplicates(), on=['snp', 'gencode_id'], how='left') + + + else: + logger.write(" * Computing Hi-C data...") + df = gene_df[['snp', 'gencode_id', 'cell_line', 'interactions', + 'replicates', 'cell_line_replicates']].merge( + eqtl_df[['snp', 'gencode_id']], on=['snp', 'gencode_id'], how='inner') + df['hic_score'] = df['interactions'] / df['cell_line_replicates'] + df = df.drop_duplicates() + grouped_df = df.groupby(['snp', 'gencode_id']) + df['cell_line_hic_scores'] = df.apply(lambda row: ', '.join( + grouped_df.get_group((row['snp'], row['gencode_id']))[ + 
diff --git a/data_preparation/create_index_pchic_sqlite3_db.py b/data_preparation/create_index_pchic_sqlite3_db.py
new file mode 100644
index 0000000..3478cdc
--- /dev/null
+++ b/data_preparation/create_index_pchic_sqlite3_db.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import sqlite3
+import sys
+import os
+
+
+def create_index_db(db_path):
+
+    for path in db_path:
+        # one SQLite database per PCHi-C replicate; index each in turn
+        with sqlite3.connect(path) as conn:
+            cursor = conn.cursor()
+            print("Creating index on tables in database...")
+            cursor.execute('''CREATE INDEX idx_pfid_oefid ON
+                           interactions (p_fid, oe_fid)''')
+
+    return
+
+db_fp = '/mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/'
+
+db_path = []
+for cell_lines in os.listdir(db_fp):
+    cell_lines_fp = os.path.join(db_fp, cell_lines)
+    # each cell line directory holds one .db file per replicate
+    for dbs in os.listdir(cell_lines_fp):
+        dbs_fp = os.path.join(cell_lines_fp, dbs)
+        db_path.append(dbs_fp)
+
+create_index_db(db_path)
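
The composite index created above serves bait-side lookups against the
per-replicate interaction databases. A sketch of the kind of query it
accelerates; the database path and fragment ID are illustrative, while the
table and column names come from the scripts in this patch:

    import sqlite3

    conn = sqlite3.connect(
        '/mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/CD4_T_cells/rep1.db')
    cursor = conn.cursor()
    # the (p_fid, oe_fid) index narrows the scan to matching bait fragments
    cursor.execute(
        'SELECT p_fid, oe_fid, n_reads, score FROM interactions WHERE p_fid = ?',
        (367969,))
    for row in cursor.fetchall():
        print(row)
    conn.close()
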
diff --git a/data_preparation/init_pchic_gene_tables.py b/data_preparation/init_pchic_gene_tables.py
new file mode 100644
index 0000000..4fa43d8
--- /dev/null
+++ b/data_preparation/init_pchic_gene_tables.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import os
+import sys
+import sqlite3
+import argparse
+from sqlalchemy import create_engine
+import csv
+
+
+def read_promoter_file(promoters):
+    col = ['frag_chr', 'frag_start', 'frag_end', 'frag_id', 'gene', 'project']
+    gene_prom = pd.read_csv(promoters, sep = "\t", names = col,
+                            usecols = ['frag_id', 'gene', 'project'])
+    return gene_prom
+
+def read_genebed(genebed):
+    cols = ['chr', 'start', 'end', 'gene', 'gencode_id']
+    gene_bed = pd.read_csv(genebed, sep = "\t", names = cols)
+    return gene_bed
+
+def build_gene_table(gene_prom, gene_bed, table, db_url, outfile):
+    merge_tables = pd.merge(gene_prom, gene_bed, how='inner', on='gene', sort=False)
+    gene_file = merge_tables[['chr', 'start', 'end', 'gene', 'gencode_id',
+                              'frag_id', 'project']].drop_duplicates()
+    db = create_engine(db_url, echo=False)
+    gene_file.to_csv(outfile, sep='\t', header=True, index=False)
+    gene_file.to_sql(table, con=db, if_exists='replace', index=False)
+    print("Creating index...")
+    db.execute("CREATE INDEX idx_{}_frag_id ON {}(frag_id)".format(table, table))
+    db.execute('''CREATE INDEX idx_{}_fragid_gencodeid on
+               {}(frag_id,gencode_id)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_fragid_gencodeid_project on
+               {}(frag_id,gencode_id,project)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_gencodeid_project on
+               {}(gencode_id,project)'''.format(table, table))
+    db.execute('''CREATE INDEX idx_{}_gencode_id on
+               {}(gencode_id)'''.format(table, table))
+    print("Done.")
+    return
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description =
+        'Build gene tables for promoter capture Hi-C data analysis')
+    parser.add_argument('-p', dest = 'pro_file', nargs='+', required = True,
+        help = '''Provide a path to the promoter fragment file (no header).
+        Expected columns in the file: frag_chr, frag_start, frag_end, frag_id,
+        gene_name, project. Multiple files should be separated by a space.''')
+    parser.add_argument('-g', dest = 'gene_bed', required = True,
+        help='''Path to the gene bed file with extension .bed.
+        e.g. /mnt/projects/codes3d/lib/reference_files/genes/gene_reference.bed''')
+    parser.add_argument("-t", dest = 'table', required = True,
+        help='Name of table e.g. gene_lookup_pchic_hindiii.')
+    parser.add_argument("-u", dest = 'db_url', required = True,
+        help="URL of database e.g. postgresql://user:password@hostname/database")
+    parser.add_argument("-o", dest = 'outfile', required = True,
+        help="Absolute path to the output file.")
+    args = parser.parse_args()
+
+    promoters = []
+    if len(args.pro_file) > 1:
+        for pfile in args.pro_file:
+            p_df = read_promoter_file(pfile)
+            promoters.append(p_df)
+        promoters = pd.concat(promoters)
+    else:
+        promoters = read_promoter_file(args.pro_file[0])
+
+    genebed = read_genebed(args.gene_bed)
+
+    build_gene_table(promoters,
+                     genebed,
+                     args.table,
+                     args.db_url,
+                     args.outfile)
+
+
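
Once built, the gene table maps a promoter (bait) fragment ID to its annotated
genes. A sketch of the intended lookup, reusing the example table name and
database URL from the help text above; the fragment ID is illustrative:

    import pandas as pd
    from sqlalchemy import create_engine

    db = create_engine('postgresql://user:password@hostname/database', echo=False)
    sql = '''SELECT gene, gencode_id, project
             FROM gene_lookup_pchic_hindiii
             WHERE frag_id = 218'''
    # one row per gene annotated to the bait fragment
    print(pd.read_sql(sql, con=db))
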
diff --git a/data_preparation/init_pchic_meta.py b/data_preparation/init_pchic_meta.py
new file mode 100755
index 0000000..76759bc
--- /dev/null
+++ b/data_preparation/init_pchic_meta.py
@@ -0,0 +1,31 @@
+#! /usr/bin/env python
+
+import os
+import pandas as pd
+import sys
+from sqlalchemy import create_engine
+
+if __name__ == '__main__':
+    '''
+    Create table for meta info on PCHi-C libraries.
+    To be run after placing PCHi-C libraries in
+    '../lib/pchic/<enzyme>/pchic_data/<library>/<replicate>.db'
+    '''
+    pchic_fp = '/mnt/projects/codes3d/lib/pchic'  # site-specific library root
+    enzyme_dict = []
+    for enzyme in os.listdir(pchic_fp):
+        enzyme_fp = os.path.join(pchic_fp, enzyme)
+        for library in os.listdir(os.path.join(enzyme_fp, 'pchic_data')):
+            library_fp = os.path.join(enzyme_fp, 'pchic_data', library)
+            # each library directory holds one <replicate>.db per replicate;
+            # the replicate name is the file name without its extension
+            for rep in os.listdir(library_fp):
+                enzyme_dict.append((rep.split('.')[0], enzyme, library))
+    replicates = pd.DataFrame(enzyme_dict, columns=['replicate', 'enzyme', 'library'])
+    libraries = replicates.groupby(
+        ['library', 'enzyme']).size().reset_index(name='rep_count')
+    from_file = pd.read_csv('/mnt/projects/users/sgok603/codes3d/meta_info/meta_pchic.txt', sep="\t")
+    libraries = libraries.merge(from_file, how='left', on=['library', 'enzyme']).drop_duplicates()
+    db = create_engine(
+        'postgresql://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False)
+    libraries.drop_duplicates().to_sql('meta_pchic', con=db, if_exists='replace')
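
The directory layout init_pchic_meta.py walks is therefore as follows; the
library and replicate names are illustrative:

    lib/pchic/
    └── HindIII/
        └── pchic_data/
            ├── CD4_T_cells/
            │   ├── rep1.db
            │   └── rep2.db
            └── monocytes/
                └── rep1.db
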
diff --git a/data_preparation/init_pchic_sqlite_db.py b/data_preparation/init_pchic_sqlite_db.py
new file mode 100644
index 0000000..b48bc65
--- /dev/null
+++ b/data_preparation/init_pchic_sqlite_db.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import sqlite3
+from sqlite3 import Error
+import os
+import sys
+import argparse
+import time
+
+def read_digested_genome(genome):
+
+    cols = ['fchr', 'fstart', 'fend', 'fid']
+    if genome.endswith('.rmap'):
+        genome_df = pd.read_csv(genome, sep = "\t", header = None, names=cols)
+        # one row per restriction fragment: chrom, start, end, fragment ID
+        return genome_df
+
+
+def read_chicago_output_ibeds(inputs):
+
+    inter_cols = ['b_chr', 'b_start', 'b_end', 'b_name', 'oe_chr', 'oe_start', 'oe_end', 'oe_name', 'n_reads', 'score']
+    input_df = pd.read_csv(inputs, sep = "\t", skiprows=1, names=inter_cols)
+    return input_df
+
+
+def build_pchic_db(inputs, genome, tissue_dirs, reps_dirs, out_dir, db_tblname):
+
+    genome_df = read_digested_genome(genome)
+    input_df = read_chicago_output_ibeds(inputs)
+
+    bait_df = pd.merge(input_df, genome_df, how='inner', left_on=['b_chr', 'b_start', 'b_end'],
+                       right_on=['fchr', 'fstart', 'fend'], sort = False)
+    bait_short_df = bait_df[['b_chr', 'b_start', 'b_end', 'b_name', 'fid']]
+    bait_short_df.columns = ['p_chr', 'p_start', 'p_end', 'p_name', 'p_fid']
+
+    oe_df = pd.merge(input_df, genome_df, how='inner', left_on=['oe_chr', 'oe_start', 'oe_end'],
+                     right_on=['fchr', 'fstart', 'fend'], sort = False)
+    oe_short_df = oe_df[['oe_chr', 'oe_start', 'oe_end', 'oe_name', 'fid', 'n_reads', 'score']]
+    oe_short_df.columns = ['oe_chr', 'oe_start', 'oe_end', 'oe_name', 'oe_fid', 'n_reads', 'score']
+
+    bait_oe_df = pd.concat([bait_short_df, oe_short_df], axis=1, sort=False)  # assumes both merges matched every interaction, so rows stay aligned
+
+    tissue_dir_basename = os.path.splitext(os.path.basename(tissue_dirs))[0]
+    rep_dir_basename = os.path.splitext(os.path.basename(reps_dirs))[0]
+    db_file = rep_dir_basename + '.db'
+    out_db_fp = os.path.join(out_dir, tissue_dir_basename)
+
+    if not os.path.exists(out_db_fp):
+        os.makedirs(out_db_fp, exist_ok=True)
+        os.chmod(out_db_fp, 0o777)
+
+    db_fp = os.path.join(out_db_fp, db_file)
+    # the sqlite3 context manager commits the transaction on success
+    with sqlite3.connect(db_fp) as conn:
+        cursor = conn.cursor()
+        print("Creating database {}".format(db_file))
+        bait_oe_df.to_sql(db_tblname, if_exists='replace', con=conn, index=False)
+        print("Creating index on tables in database {}...".format(db_file))
+        cursor.execute('''CREATE INDEX idx_{}_baits
+                       ON {} (p_fid)'''.format(rep_dir_basename, db_tblname))
+        cursor.execute('''CREATE INDEX idx_{}_oes
+                       ON {} (oe_fid)'''.format(rep_dir_basename, db_tblname))
+        cursor.execute('''CREATE INDEX idx_{}_pfid_oefid
+                       ON {} (p_fid,oe_fid)'''.format(rep_dir_basename, db_tblname))
+        print("Done!")
+
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(
+        description = "Build SQLite libraries of PCHi-C data for CoDeS3D.")
+
+    parser.add_argument(
+        '-r', '--rmap-file', required = True,
+        help = '''Bowtie-digested hg38 genome file with the extension .rmap
+        that was used for the CHiCAGO run.''')
+
+    parser.add_argument(
+        '-d', '--pchic-dir', required = True,
+        help = '''Parent directory containing tissue subdirectories, which in
+        turn contain replicate subdirectories (e.g. parent/tissue/replicate_{1..n}/)
+        where the CHiCAGO output .ibed files reside.''')
+
+    parser.add_argument(
+        '-o', '--output_dir', required = True,
+        help = '''Path to the output directory in which to save the PCHi-C
+        databases. The directory is created automatically if it does not
+        already exist.''')
+
+    return parser.parse_args()
+
+if __name__ == '__main__':
+
+    args = parse_args()
+    start_time = time.time()
+
+    if not (args.rmap_file and args.pchic_dir and args.output_dir):
+        message = '''Missing one or more of --rmap-file, --pchic-dir and --output_dir.
+        See init_pchic_sqlite_db.py -h for more details.'''
+        print(message)
+        sys.exit()
+
+    if args.rmap_file is None:
+        message = '''Missing parameter: a genome fragment file with the extension .rmap is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+    else:
+        print("\nUsing rmap file in: '%s'\n" %args.rmap_file)
+
+    if args.pchic_dir is None:
+        message = '''Missing parameter: a directory containing .ibed files from CHiCAGO is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+
+    if args.output_dir is None:
+
+        message = '''Missing parameter: an output directory to save the databases is required.
+        For more details, see init_pchic_sqlite_db.py -h'''
+        print(message)
+        sys.exit()
+
+    # resolve and, if necessary, create the output directory; note that
+    # os.makedirs returns None, so its result is never assigned
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir, exist_ok = True)
+    out_dir = args.output_dir
+    print("PCHi-C databases will be stored in the output directory: '%s'\n" %out_dir)
+
+    db_tblname = 'interactions'
+    for tissues in os.listdir(args.pchic_dir):
+        # first level: one subdirectory per tissue or cell line
+        tissue_dirs = os.path.join(args.pchic_dir, tissues)
+        # second level: one subdirectory per replicate
+        for reps in os.listdir(tissue_dirs):
+            # each replicate directory holds the CHiCAGO outputs for one run
+            reps_dirs = os.path.join(tissue_dirs, reps)
+            # build one SQLite database per replicate from its .ibed file
+            for files in os.listdir(reps_dirs):
+                files_path = os.path.join(reps_dirs, files)
+                # skip anything that is not a CHiCAGO .ibed output
+                if not files_path.endswith('.ibed'):
+                    continue
+                else:
+                    input_fp = files_path  # already a full path; joining again would corrupt it
+                    build_pchic_db(input_fp,
+                                   args.rmap_file,
+                                   tissue_dirs,
+                                   reps_dirs,
+                                   out_dir,
+                                   db_tblname)
+    print("All done.")
+    print('Total time elapsed: {:.2f} mins'.format((time.time()-start_time)/60))
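
A typical invocation of the script above, with purely illustrative paths (the
-r, -d and -o options are those defined in parse_args()):

    python init_pchic_sqlite_db.py \
        -r hg38_hindiii_fragments.rmap \
        -d /path/to/chicago_output/ \
        -o /mnt/projects/codes3d/lib/pchic/HindIII/pchic_data/
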
diff --git a/data_preparation/init_variant_fragment_table_pchic.py b/data_preparation/init_variant_fragment_table_pchic.py
new file mode 100755
index 0000000..a9800f1
--- /dev/null
+++ b/data_preparation/init_variant_fragment_table_pchic.py
@@ -0,0 +1,98 @@
+import pybedtools
+import pandas as pd
+import sys
+from tqdm import tqdm
+import argparse
+import os
+import time
+from sqlalchemy import create_engine
+
+
+def find_variant_fragments(query_fp, fragment_fp, output_fp, db_url, table):
+    """ Create table of variant fragment IDs. """
+    fragment_bed = pybedtools.BedTool(fragment_fp)
+    desc = 'Finding variant fragments...'
+    bar_format = '{desc}: {n_fmt} {unit}'
+    t = tqdm(total=0, unit='variants', desc=desc, disable=False, bar_format=bar_format)
+    chunksize = 200000
+    cols = ['chr', 'frag_id', 'id', 'variant_id']  # overwritten per chunk below
+    pybedtools.set_tempdir(os.path.dirname(output_fp))
+    db = create_engine(db_url, echo=False)
+    idx = 1
+    for query_df in pd.read_csv(
+            query_fp, sep='\t', compression='gzip', chunksize=chunksize,
+            usecols=['chr', 'variant_pos', 'variant_id']):
+        query_df = query_df.rename(columns={'variant_pos': 'end'})
+        query_df['id'] = range(idx, idx+len(query_df))
+        query_df['start'] = query_df['end'] - 1
+        query_bed = pybedtools.BedTool.from_dataframe(
+            query_df[['chr', 'start', 'end', 'id', 'variant_id']])
+        df = fragment_bed.intersect(query_bed, wb=True)
+        df = df.to_dataframe(
+            names=['frag_chr', 'frag_start', 'frag_end', 'frag_id',
+                   'chrom', 'start', 'end', 'id', 'variant_id'])
+        cols = ['frag_id', 'chrom', 'id']
+        mode = 'w' if t.total == 0 else 'a'
+        header = t.total == 0  # write the header only for the first chunk
+        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode)
+        if table:
+            if_exists = 'replace' if t.total == 0 else 'append'
+            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
+        idx += len(query_df)
+        t.total += len(query_df)
+        t.update(len(query_df))
+    t.close()
+    pybedtools.cleanup(remove_all=True)
+    if not table:
+        return
+    create_index(table, db)
+
+def create_index(table, db):
+    print('Creating index for {}...'.format(table))
+    db.execute(
+        '''CREATE INDEX idx_{}_frag_chrom ON {}(frag_id, chrom)
+        '''.format(table, table))
+    db.execute(
+        '''CREATE INDEX idx_{}_id ON {}(id)
+        '''.format(table, table))
+    db.execute(
+        '''CREATE INDEX idx_{}_fragid ON {}(frag_id)
+        '''.format(table, table))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Create a PostgreSQL table of variant or gene fragment IDs.")
+    parser.add_argument(
+        '-f', '--fragments', required=True,
+        help='The filepath to the fragment BED file')
+    parser.add_argument(
+        '-v', '--variants',
+        help=('The filepath to the variant BED file. Must contain chr, '
+              'variant_pos, and variant_id columns'))
+    parser.add_argument(
+        "-t", "--table",
+        help='Name of table e.g. variant_lookup_pchic_hindiii.')
+    parser.add_argument(
+        "-u", "--db-url", required=False,
+        help='URL of database e.g. postgresql://user:password@hostname/database')
+    parser.add_argument(
+        "-c", "--config",
+        default=os.path.join(os.path.dirname(__file__),
+                             "../docs/codes3d.conf"),
+        help="The configuration file to be used for this " +
+        "run (default: codes3d.conf)")
+    parser.add_argument(
+        '-o', '--output', required=True,
+        help='The filepath to the output file.')
+    args = parser.parse_args()
+    start_time = time.time()
+    if not args.variants:
+        sys.exit('Variants or gene bed file missing.')
+    if os.path.exists(args.output):
+        os.remove(args.output)
+    if args.variants:
+        find_variant_fragments(
+            args.variants, args.fragments, args.output,
+            args.db_url, args.table)
+    print('Time elapsed: {:.2f} mins'.format(
+        (time.time() - start_time)/60))
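
A sketch of how this script is run; the file names are illustrative, and note
that the variant file is read with compression='gzip', so it must be gzipped:

    python init_variant_fragment_table_pchic.py \
        -f hg38_hindiii_fragments.bed \
        -v variant_positions.txt.gz \
        -t variant_lookup_pchic_hindiii \
        -u postgresql://user:password@hostname/database \
        -o variant_fragments.txt
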
diff --git a/data_preparation/tidy_promoter_baitmap_file.py b/data_preparation/tidy_promoter_baitmap_file.py
new file mode 100644
index 0000000..3a85a0c
--- /dev/null
+++ b/data_preparation/tidy_promoter_baitmap_file.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import argparse
+import time
+import os
+import sys
+
+
+def tidy_promoter_file(baits, out_file, p_id):
+    cols = ['chr', 'start', 'end', 'frag_id', 'genes']
+    bait_df = pd.read_csv(baits, sep="\t", names = cols).drop_duplicates()
+    bait_df['genes'] = bait_df['genes'].str.replace(', ', ',')
+    bait_df['genes'] = bait_df['genes'].str.split(',')
+    bait_df_new = bait_df.explode('genes')
+    bait_df_new = bait_df_new.dropna()
+    bait_df_new = bait_df_new.drop_duplicates()
+    bait_df_new['project'] = p_id
+    bait_df_new.sort_values(by = 'genes', inplace=True, ascending=True)
+    bait_df_new.to_csv(out_file, sep='\t', header=False, index=False)
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Tidy up promoter file for CoDeS3D pipeline")
+
+    parser.add_argument(
+        '-f', '--bait-file', required = True,
+        help = '''Pass a promoter information file with the extension .baitmap
+        that was used for the CHiCAGO analysis''')
+
+    parser.add_argument(
+        '-o', '--out-fp', required = True,
+        help = '''Provide a path to an output file''')
+
+    parser.add_argument(
+        '-p', '--project-id', type=str, required = True,
+        help = '''Project ID to be appended to the output file,
+        which will be used while running the CoDeS3D pipeline.
+        Example: 'InkyungJung2019'. ''')
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+
+    args = parse_args()
+    if not (args.bait_file and args.out_fp and args.project_id):
+        message = '''One or more of the required parameters are missing.
        Use tidy_promoter_baitmap_file.py -h for more details.'''
+        print(message)
+        sys.exit()
+
+    tidy_promoter_file(args.bait_file,
+                       args.out_fp,
+                       args.project_id)
+
+
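
The tidying step above turns one bait row carrying a comma-separated gene list
into one row per (fragment, gene) pair. A minimal sketch with made-up values:

    import pandas as pd

    bait_df = pd.DataFrame({
        'chr': ['chr1'], 'start': [850619], 'end': [874081],
        'frag_id': [218], 'genes': ['SAMD11,NOC2L']})
    bait_df['genes'] = bait_df['genes'].str.split(',')
    # one row per gene, with the fragment coordinates repeated
    print(bait_df.explode('genes'))
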
diff --git a/docs/codes3d.conf b/docs/codes3d.conf
index ada7dc8..28e0085 100755
--- a/docs/codes3d.conf
+++ b/docs/codes3d.conf
@@ -12,9 +12,9 @@
 SNP_BED_DIR: %(SNP_REF_DIR)shuman_9606_b151_GRCh38p7
 RS_MERGE_ARCH: %(SNP_REF_DIR)shuman_9606_b151_GRCh38p7_RsMergeArch.pairs.bcp.gz
 HIC_DIR: %(libdir)shic
+PCHIC_DIR: %(libdir)spchic
 EQTL_DATA_DIR: %(libdir)seqtls
-
 [GTEx]
 eqtl_dir=../lib/eqtls/GTEx/
 GENE_FP: %(eqtl_dir)sGTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz

From cbdd3ebdd05b40445f9edac1e23a92df62b297fe Mon Sep 17 00:00:00 2001
From: sreemol-gokuladhas
Date: Wed, 22 Jun 2022 09:26:20 +1200
Subject: [PATCH 2/2] Added a flag to list available PCHi-C libraries

---
 codes3d/codes3d.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/codes3d/codes3d.py b/codes3d/codes3d.py
index d8965fd..e2ce4b7 100755
--- a/codes3d/codes3d.py
+++ b/codes3d/codes3d.py
@@ -230,6 +230,14 @@ def list_hic_libraries(db):
         print('{}\t{}'.format(idx + 1, row['library']))
     db.dispose()

+def list_pchic_libraries(db):
+    sql = '''SELECT library, tissue FROM meta_pchic'''
+    with db.connect() as con:
+        df = pd.read_sql(sql, con=con)
+    for idx, row in df.drop_duplicates().iterrows():
+        print('{}\t{}'.format(idx + 1, row['library']))
+    db.dispose()
+
 def list_enzymes(db):
     sql = '''SELECT DISTINCT enzyme FROM meta_hic'''
     with db.connect() as con:
@@ -618,6 +626,9 @@ def parse_args():
     parser.add_argument(
         '--list-hic-libraries', action='store_true', default=False,
         help='List available Hi-C libraries.')
+    parser.add_argument(
+        '--list-pchic-libraries', action='store_true', default=False,
+        help='List available PCHi-C libraries.')
     parser.add_argument(
         '--match-tissues', action='append', nargs=argparse.REMAINDER,
         default=None, help='''Try to match eQTL and Hi-C tissue types using space-separated
@@ -701,6 +712,9 @@ def validate_args(args, commons_db):
     if args.list_hic_libraries:
         list_hic_libraries(commons_db)
         sys.exit()
+    if args.list_pchic_libraries:
+        list_pchic_libraries(commons_db)
+        sys.exit()
     if args.list_enzymes:
         list_enzymes(commons_db)
         sys.exit()
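
With the libraries prepared and registered as above, the new flags are used as
follows (shown without the pipeline's usual SNP-input and output options,
which are unchanged by this patch):

    # list the installed PCHi-C libraries
    python codes3d/codes3d.py --list-pchic-libraries

    # query PCHi-C libraries instead of Hi-C libraries
    python codes3d/codes3d.py --pchic [other options...]
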