From 2359f2e60455e9be76dce7c30fb374bdc1e1527d Mon Sep 17 00:00:00 2001
From: Tayaza Fadason <t.fadason@auckland.ac.nz>
Date: Thu, 17 Feb 2022 15:40:24 +1300
Subject: [PATCH] Update dependencies.

---
 codes3d/init_eqtl_genotype_db.py |  63 ----------
 codes3d/init_eqtl_meta.py        |  17 ---
 codes3d/init_fragment_db.py      | 161 -------------------------
 codes3d/init_gene_variant_db.py  | 118 ------------------
 codes3d/init_hic_db_postgres.py  | 166 --------------------------
 codes3d/init_hic_db_sqlite.py    | 197 -------------------------------
 codes3d/init_hic_meta.py         |  30 -----
 docs/requirements.txt            |  16 ---
 environment.yaml                 |   2 +-
 9 files changed, 1 insertion(+), 769 deletions(-)
 delete mode 100755 codes3d/init_eqtl_genotype_db.py
 delete mode 100755 codes3d/init_eqtl_meta.py
 delete mode 100755 codes3d/init_fragment_db.py
 delete mode 100755 codes3d/init_gene_variant_db.py
 delete mode 100755 codes3d/init_hic_db_postgres.py
 delete mode 100755 codes3d/init_hic_db_sqlite.py
 delete mode 100755 codes3d/init_hic_meta.py
 delete mode 100755 docs/requirements.txt

diff --git a/codes3d/init_eqtl_genotype_db.py b/codes3d/init_eqtl_genotype_db.py
deleted file mode 100755
index 0096be3..0000000
--- a/codes3d/init_eqtl_genotype_db.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#! /usr/bin/env python
-
-import torch
-from tensorqtl import *
-from tensorqtl import genotypeio, trans
-from tensorqtl import tensorqtl
-import argparse
-import pandas as pd
-from sqlalchemy import create_engine
-import time
-from tqdm import tqdm
-sys.path.insert(
-    1, os.path.join(os.path.abspath(os.path.dirname(__file__)), 'tensorqtl'))
-
-'''Create genotype table for eQTL mapping by tensorqtl
-'''
-
-
-def create_genotype_table(genotype_df, db_url):
-    table = 'genotype'
-    desc = 'Building genotype table...'
-    bar_format = '{desc}: {n_fmt} {unit}'
-    t = tqdm(total=0, unit='variants', desc=desc, disable=False,
-             bar_format=bar_format)
-    chunksize = 50000
-    chunks = [genotype_df[i:i+chunksize]
-              for i in range(0, len(genotype_df), chunksize)]
-    db = create_engine(db_url, echo=False)
-    for df in chunks:
-        if_exists = 'replace' if t.total == 0 else 'append'
-        df.to_sql(table, con=db, if_exists=if_exists)
-        t.total += len(df)
-        t.update(len(df))
-    t.close()
-    # db.execute('''CREATE INDEX idx_{}_snp ON {}
-    # (snp)'''.format(table, table))
-
-
-def read_genotype(plink_prefix):
-    ''' Read WGS VCF '''
-    pr = genotypeio.PlinkReader(plink_prefix)
-    genotype_df = pd.DataFrame(pr.load_genotypes(),
-                               #index=pr.bim['snp'],
-                               columns=pr.fam['iid'])
-    return genotype_df
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='')
-    parser.add_argument(
-        '-p', '--plink-prefix', required=True,
-        help='''Filepath to plink generated files of genotypes but without the 
-        extensions''')
-    parser.add_argument(
-        '-u', '--db-url', required=True,
-        help='URL of database e.g postgresql://user:password@hostname/database')
-    args = parser.parse_args()
-    start_time = time.time()
-    genotype_df = read_genotype(args.plink_prefix)
-    print(genotype_df)
-    create_genotype_table(genotype_df, args.db_url)
-    print('Time elapsed: {:.2f} mins'.format(
-        (time.time() - start_time)/60))
diff --git a/codes3d/init_eqtl_meta.py b/codes3d/init_eqtl_meta.py
deleted file mode 100755
index ee84163..0000000
--- a/codes3d/init_eqtl_meta.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#! /usr/bin/env python
-
-import os
-import pandas as pd
-import sys
-#import psycopg2
-from sqlalchemy import create_engine
-
-"""
-Create PostgreSQL table of list of eQTL projects. 
-Requires a meta_eqtls.txt
-"""
-eqtl_fp = os.path.join(os.path.dirname(__file__), '../lib/meta_info/meta_eqtls.txt')
-eqtls = pd.read_csv(eqtl_fp, sep='\t')
-db = create_engine(
-    'postgres://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False)
-eqtls.to_sql('meta_eqtls', con=db, if_exists='replace', index=False)
diff --git a/codes3d/init_fragment_db.py b/codes3d/init_fragment_db.py
deleted file mode 100755
index c6caea2..0000000
--- a/codes3d/init_fragment_db.py
+++ /dev/null
@@ -1,161 +0,0 @@
-import pybedtools
-import pandas as pd
-import sys
-from tqdm import tqdm
-import argparse
-import os
-import time
-from sqlalchemy import create_engine
-import codes3d
-
-
-def find_variant_fragments(
-        query_fp,
-        fragment_fp,
-        output_fp,
-        db_url,
-        table):
-    """ Create table of variant fragment IDs.
-    """
-    fragment_bed = pybedtools.BedTool(fragment_fp)
-    desc = 'Finding variant fragments...'
-    bar_format = '{desc}: {n_fmt} {unit}'
-    t = tqdm(total=0, unit='variants', desc=desc, disable=False,
-             bar_format=bar_format)
-    chunksize = 200000
-    cols = ['chr', 'frag_id', 'id', 'variant_id']
-    pybedtools.set_tempdir(os.path.dirname(output_fp))
-    db = create_engine(db_url, echo=False)
-    idx = 1
-    for query_df in pd.read_csv(
-            query_fp, sep='\t', compression='gzip', chunksize=chunksize,
-            usecols=['chr', 'variant_pos', 'variant_id']):
-        query_df = query_df.rename(columns={'variant_pos': 'end'})
-        query_df['id'] = range(idx, idx+len(query_df))
-        query_df['start'] = query_df['end'] - 1
-        query_bed = pybedtools.BedTool.from_dataframe(
-            query_df[['chr', 'start', 'end', 'id', 'variant_id']])
-        df = fragment_bed.intersect(query_bed, wb=True)
-        df = df.to_dataframe(
-            names=['frag_chr', 'frag_start', 'frag_end', 'frag_id',
-                   'chrom', 'start', 'end', 'id', 'variant_id'])
-        cols = ['frag_id', 'chrom', 'id']
-        mode = 'w' if t.total == 0 else 'a'
-        header = 'True' if t.total == 0 else None
-        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode)
-        if table:
-            if_exists = 'replace' if t.total == 0 else 'append'
-            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
-        idx += len(query_df)
-        t.total += len(query_df)
-        t.update(len(query_df))
-    t.close()
-    pybedtools.cleanup(remove_all=True)
-    if not table:
-        return
-    create_index(table, db)
-
-
-def find_gene_fragments(
-        query_fp,
-        fragment_fp,
-        output_fp,
-        db_url,
-        table):
-    """ Create table of gene fragment IDs.
-    """
-    fragment_bed = pybedtools.BedTool(fragment_fp)
-    desc = 'Finding gene fragments...'
-    bar_format = '{desc}: {n_fmt} {unit}'
-    t = tqdm(total=0, unit='genes', desc=desc, disable=False,
-             bar_format=bar_format)
-    chunksize = 2000
-    pybedtools.set_tempdir(os.path.dirname(output_fp))
-    db = create_engine(db_url, echo=False)
-    for query_df in pd.read_csv(
-            query_fp, sep='\t', compression='infer', chunksize=chunksize
-    ):
-        #query_df.columns = ['id', 'chr', 'start', 'end', 'gene', 'gencode_id']
-        query_bed = pybedtools.BedTool.from_dataframe(
-            query_df[['chrom', 'start', 'end', 'id', 'gencode_id']])
-        #    print(query_bed)
-        df = fragment_bed.intersect(query_bed, wb=True)
-        df = df.to_dataframe(
-            names=['frag_chr', 'frag_start', 'frag_end', 'frag_id',
-                   'chrom', 'start', 'end', 'id', 'gencode_id'])
-        cols = ['frag_id', 'chrom', 'id']
-        mode = 'w' if t.total == 0 else 'a'
-        header = 'True' if t.total == 0 else None
-        df[cols].to_csv(output_fp, sep='\t', header=header,
-                        mode=mode, index=False)
-        if table:
-            if_exists = 'replace' if t.total == 0 else 'append'
-            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
-        t.total = len(query_df)
-        t.update(len(query_df))
-    t.close()
-    pybedtools.cleanup(remove_all=True)
-    if not table:
-        return
-    create_index(table, db)
-
-
-def create_index(table, db):
-    print('Creating index for {}...'.format(table))
-    db.execute(
-        '''CREATE INDEX idx_{}_frag_chrom ON {}(frag_id, chrom) 
-        '''.format(table, table))
-    db.execute(
-        '''CREATE INDEX idx_{}_id ON {}(id) 
-        '''.format(table, table))
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description="Create s PostgreSQL table of variant or gene fragment IDs.")
-    parser.add_argument(
-        '-f', '--fragments', required=True,
-        help='The filepath to the fragment BED file with chrom, start, end, fragment ID.')
-    parser.add_argument(
-        '-v', '--variants',
-        help='''The filepath to the variant lookup file. Must contain chr,
-        variant_pos, and variant_id columns''')
-    parser.add_argument(
-        '-g', '--genes',
-        help='''The filepath to a sorted gene reference file containing 
-        id, chrom, start, end, name, and gencode_id columns.
-        Note that the id indicates the sorting (chrom, start) 
-        order of the file starting at 1.''')
-    parser.add_argument(
-        '-t', '--table',
-        help='Name of table e.g. variant_lookup_mboi.')
-    parser.add_argument(
-        '-u', '--db-url', required=True,
-        help='URL of database e.g postgresql://user:password@hostname/database')
-    parser.add_argument(
-        '-c', '--config',
-        default=os.path.join(os.path.dirname(__file__),
-                             '../docs/codes3d.conf'),
-        help='''The configuration file to be used for this  
-        run (default: codes3d.conf)''')
-
-    parser.add_argument(
-        '-o', '--output', required=True,
-        help='The filepath to output file.')
-    args = parser.parse_args()
-    c = codes3d.CODES3D(args.config)
-    start_time = time.time()
-    if not args.variants and not args.genes:
-        sys.exit('Variants or gene bed file missing.')
-    if os.path.exists(args.output):
-        os.remove(args.output)
-    if args.variants:
-        find_variant_fragments(
-            args.variants, args.fragments, args.output,
-            args.db_url, args.table)
-    elif args.genes:
-        find_gene_fragments(
-            args.genes, args.fragments, args.output,
-            args.db_url, args.table)
-    print('Time elapsed: {:.2f} mins'.format(
-        (time.time() - start_time)/60))
diff --git a/codes3d/init_gene_variant_db.py b/codes3d/init_gene_variant_db.py
deleted file mode 100755
index 610aa5c..0000000
--- a/codes3d/init_gene_variant_db.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import pandas as pd
-import sys
-from tqdm import tqdm
-import argparse
-import os
-import time
-from sqlalchemy import create_engine
-import codes3d
-
-
-def build_variants_table(
-        variant_fp,
-        db_url,
-        table):
-    """Create a table of genotype. 
-    genotype(snp, sample_id..) 
-    Indexes: 
-    "ix_genotype_snp" btree (snp)
-    """
-    desc = 'Building variants table...'
-    bar_format = '{desc}: {n_fmt} {unit}'
-    t = tqdm(total=0, unit='variants', desc=desc, disable=False,
-             bar_format=bar_format)
-    chunksize = 200000
-    cols = ['chr', 'frag_id', 'id', 'variant_id']
-    db = create_engine(db_url, echo=False)
-    idx = 1
-    for df in pd.read_csv(
-            variant_fp, sep='\t', compression='gzip', chunksize=chunksize):
-        df = df.rename(columns={
-            'chr': 'chrom',
-            'variant_pos': 'locus',
-            'rs_id_dbSNP151_GRCh38p7': 'rsid'
-            }
-        )
-        df['id'] = range(idx, idx+len(df))
-        cols = ['id', 'rsid', 'chrom', 'locus', 'variant_id', 'ref', 'alt', 'num_alt_per_site']
-        if_exists = 'replace' if t.total == 0 else 'append'
-        df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
-        idx += len(df)
-        t.total += len(df)
-        t.update(len(df))
-    t.close()
-    db.execute('''CREATE INDEX idx_{}_rsid_chrom_locus ON {}
-    (rsid, chrom, locus)'''.format(
-        '{}_{}'.format(table, 'rsid_chrom_locus'), table))
-    db.execute('''CREATE INDEX idx_{}_rsid_chrom_locus ON {}
-    (rsid, chrom, locus)'''.format(
-        '{}_{}'.format(table, 'id'), table))
-
-    
-def build_gene_table(
-        gene_fp,
-        db_url,
-        table):
-    """Create a table of gene info. 
-    gene(id, chrom, start, end, name, gencode_id) 
-    Indexes: 
-    "genes_pk" PRIMARY KEY, btree (id)
-    "genes_name_id" btree (id)
-    "genes_name_index" btree (name)
-    """
-    desc = 'Building gene table...'
-    bar_format = '{desc}: {n_fmt} {unit}'
-    t = tqdm(total=0, unit='genes', desc=desc, disable=False,
-             bar_format=bar_format)
-    chunksize = 2000
-    db = create_engine(db_url, echo=False)
-    for df in pd.read_csv(
-            query_fp, sep='\t', compression='infer', chunksize=chunksize
-    ):
-        cols = ['frag_id', 'chrom', 'id']
-        if_exists = 'replace' if t.total == 0 else 'append'
-        df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
-        t.total = len(df)
-        t.update(len(df))
-    t.close()
-    create_index(table, db)
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Build database tables gene and variant fragments')
-    parser.add_argument(
-        '-v', '--variants',
-        help='The filepath to the variant lookup file')
-    parser.add_argument(
-        '-g', '--genes',
-        help='''The filepath to a sorted gene reference file containing
-              'id, chrom, start, end, name, and gencode_id columns'.
-              Note that the id indicates the sorting (chrom, start)
-              order of the file starting at 1.''')
-    parser.add_argument(
-        '-t', '--table',
-        help='Name of table e.g. variant_lookup_mboi.')
-    parser.add_argument(
-        '-u', '--db-url', required=True,
-        help='''URL of database e.g postgresql://user:password@hostname/database
-        This is usually codes3d_commons''')
-    parser.add_argument(
-        '-c', '--config',
-        default=os.path.join(os.path.dirname(__file__),
-                             '../docs/codes3d.conf'),
-        help='''The configuration file to be used for this
-        run (default: codes3d.conf)''')
-
-    args = parser.parse_args()
-    c = codes3d.CODES3D(args.config)
-    start_time = time.time()
-    if not args.variants and not args.genes:
-        sys.exit('Variants or gene bed file missing.')
-    if args.variants:
-        build_variants_table(args.variants, args.db_url, args.table)
-    elif args.genes:
-        build_gene_table(args.genes, args.db_url, args.table)
-    print('Time elapsed: {:.2f} mins'.format(
-        (time.time() - start_time)/60))
diff --git a/codes3d/init_hic_db_postgres.py b/codes3d/init_hic_db_postgres.py
deleted file mode 100755
index 3c23720..0000000
--- a/codes3d/init_hic_db_postgres.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!usr/bin/env python
-import os
-import pandas as pd
-from tqdm import tqdm
-import argparse
-from sqlalchemy import create_engine
-from sqlalchemy_utils import create_database, database_exists
-import sys
-import codes3d
-import time
-import gzip
-import sqlite3
-
-def create_db(db_auth, db_url, tblspace):
-    if tblspace and db_auth.startswith('postgres'):
-        db = create_engine(os.path.join(db_auth, 'codes3d'), echo=False)
-        sql = '''CREATE DATABASE {} TABLESPACE {}'''
-        conn = db.connect()
-        conn.execute('commit')
-        conn.execute(sql.format(db_url, tblspace))
-        conn.close()
-    elif db_auth.startswith('sqlite'):
-        create_database(os.path.join(db_auth, db_url))
-    if database_exists(os.path.join(db_auth, db_url)):
-        print('{} database created.'.format(db_url))
-    else:
-        print('FATAL: {} database not created.'.format(db_url))
-        sys.exit()
-
-
-def build_hic_tables(cell_line, enzyme, fp, db_auth, mapq_cutoff, tblspace):
-    start_time = time.time()
-    db_url = 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower())
-    #db_url = os.path.join(db_auth, 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower()))
-    if db_auth.startswith('sqlite'):
-        db_url = db_url + '.db'
-    if not database_exists(os.path.join(db_auth, db_url)):
-        create_db(db_auth, db_url, tblspace)
-    db = create_engine(os.path.join(db_auth, db_url), echo=False)
-    rep = os.path.basename(fp).split('.')[0]
-    rep = rep[:rep.rfind('_merged')].split('_')[-1]
-    desc = '  * {} replicate {}'.format(cell_line, rep)
-    bar_format = ''
-    t = tqdm(total=0, desc=desc, bar_format=bar_format)
-    chunksize = 200000
-    idx = 1
-    #db = create_engine(db_url, echo=False)
-    for df in pd.read_csv(
-            fp, sep=' ', header=None,
-            compression='gzip', chunksize=chunksize):
-        '''
-        Columns are for short format described 
-        https://github.com/aidenlab/juicer/wiki/Pre#medium-format-most-common
-        '''
-        cols = [ 
-            'readname', 'str1', 'chr1', 'pos1', 'frag1', 'str2', 'chr2',
-            'pos2', 'frag2', 'mapq1', 'mapq2']
-        df.columns = cols
-        df = (df.drop(['mapq1', 'mapq2'], axis=1).join( # Remove non-numeric MAPQ as seen in MitDNA
-            df[['mapq1', 'mapq2']].apply(pd.to_numeric, errors='coerce')))
-        df = df[df[['mapq1', 'mapq2']].notnull().all(axis=1)]
-        self_ligation = (  # Filter self-ligating fragments
-            (df['chr1'] == df['chr2']) &\
-            (df['frag1'] == df['frag2']))
-        low_mapq = ( # Filter low quality reads
-            (df['mapq1'] >= mapq_cutoff) & (df['mapq2'] >= mapq_cutoff))
-        db_cols = ['chr1', 'frag1', 'chr2', 'frag2']
-        df = df[~self_ligation & low_mapq][db_cols]
-        df_reverse = df.rename( # Reversing to make SQL queries faster
-            columns={'chr1': 'chr2',
-                     'frag1': 'frag2',
-                     'chr2': 'chr1',
-                     'frag2': 'frag1'})
-        df = pd.concat([
-            df.sort_values(by=['chr1', 'frag1']),
-            df_reverse.sort_values(by=['chr1', 'frag1'])])
-        df[['chr1', 'chr2']] = df[['chr1', 'chr2']].astype(str)
-        patched_chrom = (df['chr1'].str.contains('_'))
-        df = df[~patched_chrom]
-        if_exists = 'replace' if t.total == 0 else 'append'
-        df.to_sql(
-            rep.lower(), con=db, if_exists=if_exists, index=False)
-        t.total += len(df)
-        t.update(len(df))
-    t.close()
-    print('  * Creating index...')
-    # '+', '-' throw a sqlite3.OperationalError
-    cell_index = cell_line.lower().replace('+', '')
-    cell_index = cell_index.replace('-', '_')
-    db.execute(
-        '''CREATE INDEX idx_{}_{}_chr1_frag1 ON {}
-        (chr1, frag1)'''.format(
-            cell_index,
-            rep.lower(),
-            rep.lower()))
-    print('    Time elapsed: {:.2f} mins'.format((time.time()-start_time)/60))
-
-
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description='Build PostgreSQL database of Hi-C libraries for CoDeS3D.')
-    parser.add_argument(
-        '--hic-dir', required=True,
-        help='''The directory containing (subdirectories containing) 
-        Hi-C non_dups.txt.gz files. Directories should be named by the cell lines.''')
-    parser.add_argument(
-        '-m', '--mapq-cutoff', default=30,
-        help='Mininum mapping quality score (mapq) of hic reads. (Default: 30)')
-    parser.add_argument(
-        "-a", "--db-auth", required=True,
-        help='''Connection string of database e.g 'postgresql://codes3d:<password>@localhost'
-        or 'sqlite:///<path to write database>' ''')
-    parser.add_argument(
-        "-e", "--enzyme", required=True,
-        help='The restriction enzyme used to prepare the Hi-C library')
-    parser.add_argument(
-        "-t", "--tablespace", default=None,
-        help='Tablespace of database.')
-    return parser.parse_args()
-
-    
-
-if __name__ == '__main__':
-    args = parse_args()
-    start_time = time.time()
-    if args.db_auth.startswith('postgres') and not args.tablespace:
-        sys.exit('''PostgreSQL databases require a TABLESPACE to be set.
-        Provide the name of the tablespace with the '--tablespace' flag.
-        See https://www.postgresqltutorial.com/postgresql-create-tablespace/''')
-    logfile = '/tmp/codes3d_db.log'
-    done = []
-    if os.path.exists(logfile):
-        f = open(logfile, 'r')
-        done = [lib.rstrip() for lib in f]
-    print('Building...')    
-    for first_level in os.listdir(args.hic_dir):
-        if os.path.isdir(os.path.join(args.hic_dir, first_level)):
-            for second_level in os.listdir(os.path.join(args.hic_dir, first_level)):
-                cell_line = first_level
-                fp = os.path.join(args.hic_dir, first_level, second_level)
-                if fp in done:
-                    continue
-                build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff,
-                    args.tablespace)
-                with open(logfile, 'a') as f:
-                    f.write('{}\n'.format(fp))
-        elif os.path.isfile(os.path.join(args.hic_dir, first_level)):
-            cell_line = ''
-            if args.hic_dir.endswith('/'):
-                cell_line = args.hic_dir.strip().split('/')[-2]
-            else:
-                cell_line = args.hic_dir.strip().split('/')[-1]
-            fp = os.path.join(args.hic_dir,first_level)
-            if fp in done:
-                continue
-            build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff,
-                             args.tablespace)
-            with open(logfile, 'a') as f:
-                f.write('{}\n'.format(fp))
-
-    os.remove(logfile)
-    print('Total time elapsed: {:.2f} mins'.format((time.time()-start_time)/60))
-
-
diff --git a/codes3d/init_hic_db_sqlite.py b/codes3d/init_hic_db_sqlite.py
deleted file mode 100755
index f371189..0000000
--- a/codes3d/init_hic_db_sqlite.py
+++ /dev/null
@@ -1,197 +0,0 @@
-#!usr/bin/env python
-import os
-import pandas as pd
-from tqdm import tqdm
-import argparse
-from sqlalchemy import create_engine
-from sqlalchemy_utils import create_database, database_exists
-import sys
-import codes3d
-
-
-def build_hic_tables(cell_line, enzyme, fp, db_auth, mapq_cutoff, tblspace):
-    db_url = os.path.join(db_auth, 'hic_{}_{}'.format(enzyme.lower(), cell_line.lower()))
-    if db_url.startswith('sqlite'):
-        db_url = db_url + '.db'
-    db = create_engine(db_url, echo=False)
-    if not database_exists(db_url):
-        create_database(db_url)
-        if tblspace and db_url.startswith('postgres'):
-            db.execute(
-                '''ALTER DATABASE {} SET TABLESPACE {}; '''.format(
-                    'hic_{}_{}'.format(db_url), tblspace))
-        print('{} database created.'.format(db_url))
-    rep = os.path.basename(fp).split('.')[0]
-    rep = rep[:rep.rfind('_merged')].split('_')[-1]
-    desc = '  * {} replicate {}'.format(cell_line, rep)
-    bar_format = ''
-    t = tqdm(total=0, desc=desc, bar_format=bar_format)
-    chunksize = 200000
-    idx = 1
-    #db = create_engine(db_url, echo=False)
-    for df in pd.read_csv(
-            fp, sep=' ', header=None,
-            compression='gzip', chunksize=chunksize):
-        '''
-        Columns are for short format described 
-        https://github.com/aidenlab/juicer/wiki/Pre#medium-format-most-common
-        '''
-        cols = [ 
-            'readname', 'str1', 'chr1', 'pos1', 'frag1', 'str2', 'chr2',
-            'pos2', 'frag2', 'mapq1', 'mapq2']
-        df.columns = cols
-        self_ligation = (  # Filter self-ligating fragments
-            (df['chr1'] == df['chr2']) &\
-            (df['frag1'] == df['frag2']))
-        low_mapq = ( # Filter low quality reads
-            (df['mapq1'] >= mapq_cutoff) & (df['mapq2'] >= mapq_cutoff))
-        db_cols = ['chr1', 'frag1', 'chr2', 'frag2']
-        df = df[~self_ligation & low_mapq][db_cols]
-        df_reverse = df.rename( # Reversing to make SQL queries faster
-            columns={'chr1': 'chr2',
-                     'frag1': 'frag2',
-                     'chr2': 'chr1',
-                     'frag2': 'frag1'})
-        df = pd.concat([
-            df.sort_values(by=['chr1', 'frag1']),
-            df_reverse.sort_values(by=['chr1', 'frag1'])])
-        df[['chr1', 'chr2']] = df[['chr1', 'chr2']].astype(str)
-        patched_chrom = (df['chr1'].str.contains('_'))
-        df = df[~patched_chrom]
-        if_exists = 'replace' if t.total == 0 else 'append'
-        df.to_sql(
-            rep.lower(), con=db, if_exists=if_exists, index=False)
-        idx += len(df)
-        t.total += len(df)
-        t.update(len(df))
-    t.close()
-    db.execute(
-        '''CREATE INDEX idx_{}_{}_chr1_frag1 ON {}
-        (chr1, frag1)'''.format(cell_line, rep.lower(), rep.lower()))
-
-
-
-def build_hic_index(
-        input_hic_fp,
-        output_fp=None,
-        chr1_col=3,
-        chr2_col=7,
-        frag1_col=5,
-        frag2_col=9,
-        mapq1_col=10,
-        mapq2_col=11,
-        mapq_cutoff=30,
-        do_not_tidy_up=False):
-    if not output_fp:
-        if not input_hic_fp.rfind('.') == -1:
-            output_fp = input_hic_fp[:input_hic_fp.rfind('.')] + ".db"
-        else:
-            output_fp = input_hic_fp + ".db"
-    if os.path.isfile(output_fp):
-        overwrite = input(
-            "WARNING: Overwriting existing HiC database (%s). Continue? [y/N] " %
-            output_fp)
-        if not upsert.lower() == 'y':
-            print("Did not overwrite existing HiC database.")
-            return
-        os.remove(output_fp)
-    # Do line count for progress meter
-    print("Determining table size...")
-    hic_table = open(input_hic_fp, 'r')
-    lines = 0
-    for i in hic_table:
-        lines += 1
-    hic_table.close()
-    lines = lines // 100 * 100  # Get an approximation
-    do_linecount = not lines == 0
-
-    int_db = sqlite3.connect(output_fp)
-    interactions = int_db.cursor()
-    interactions.execute(
-        "CREATE TABLE IF NOT EXISTS interactions (chr1 text, fragment1 text, chr2 text, fragment2 text)")
-    interactions.execute(
-        "CREATE INDEX IF NOT EXISTS i_1 ON interactions (chr1, fragment1)")
-    chr1_col = int(chr1_col) - 1
-    chr2_col = int(chr2_col) - 1
-    frag1_col = int(frag1_col) - 1
-    frag2_col = int(frag2_col) - 1
-    mapq1_col = int(mapq1_col) - 1
-    mapq2_col = int(mapq2_col) - 1
-    with open(input_hic_fp, 'r') as rao_table:
-        print("Indexing HiC interaction table...")
-        for i, line in enumerate(rao_table):
-            if do_linecount:
-                if i % (lines / 100) == 0:
-                    print("\tProcessed %d%%..." %
-                          ((float(i) / float(lines)) * 100))
-            interaction = line.strip().split(' ')
-            if int(
-                    interaction[mapq1_col]) >= mapq_cutoff and int(
-                    interaction[mapq2_col]) >= mapq_cutoff:
-                chr1 = interaction[chr1_col]
-                frag1 = interaction[frag1_col]
-                chr2 = interaction[chr2_col]
-                frag2 = interaction[frag2_col]
-                interactions.execute(
-                    "INSERT INTO interactions VALUES(?,?,?,?)", [
-                        chr1, frag1, chr2, frag2])
-                interactions.execute(
-                    "INSERT INTO interactions VALUES(?,?,?,?)", [
-                        chr2, frag2, chr1, frag1])
-    int_db.commit()
-    interactions.close()
-    print("Done indexing HiC interaction table.")
-    if not do_not_tidy_up:
-        print("Tidying up...")
-        os.remove(input_hic_fp)
-
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description='Build SQLite database of  Hi-C libraries for CoDeS3D.')
-    parser.add_argument(
-        '--hic-dir', required=True,
-        help='The directory containing Hi-C non_dups.txt.gz files.')
-    parser.add_argument(
-        '-m', '--mapq-cutoff', default=30,
-        help='Mininum mapping quality score (mapq) of hic reads.')
-    parser.add_argument(
-        "-a", "--db-auth", required=True,
-        help='URL of database e.g postgresql://user:password@hostname')
-    parser.add_argument(
-        "-e", "--enzyme", required=True,
-        help='The restriction enzyme used to prepare the Hi-C library')
-    parser.add_argument(
-        "-t", "--tablespace", default=None,
-        help='Tablespace of database.')
-    parser.add_argument("-c", "--config",
-                        default=os.path.join(os.path.dirname(__file__),
-                                             "../docs/codes3d.conf"),
-                        help="The configuration file to be used for this " +
-                        "run (default: codes3d.conf)")
-
-    return parser.parse_args()
-
-    
-
-if __name__ == '__main__':
-    args = parse_args()
-
-    print('Building...')
-    for first_level in os.listdir(args.hic_dir):
-        if os.path.isdir(os.path.join(args.hic_dir, first_level)):
-            for second_level in os.listdir(os.path.join(args.hic_dir, first_level)):
-                cell_line = first_level
-                fp = os.path.join(args.hic_dir,first_level, second_level)
-            build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, args.tablespace)
-        elif os.path.isfile(os.path.join(args.hic_dir, first_level)):
-            cell_line = ''
-            if args.hic_dir.endswith('/'):
-                cell_line = args.hic_dir.strip().split('/')[-2]
-            else:
-                cell_line = args.hic_dir.strip().split('/')[-1]
-            fp = os.path.join(args.hic_dir,first_level)
-            build_hic_tables(cell_line, args.enzyme, fp, args.db_auth, args.mapq_cutoff, args.tablespace)
-                
-                
diff --git a/codes3d/init_hic_meta.py b/codes3d/init_hic_meta.py
deleted file mode 100755
index 76a078a..0000000
--- a/codes3d/init_hic_meta.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#! /usr/bin/env python
-
-import os
-import pandas as pd
-import sys
-from sqlalchemy import create_engine
-
-
-if __name__=='__main__':
-    """Create table for meta info on Hi-C libraries."""
-    hic_fp = os.path.join(os.path.dirname(__file__), '../lib/hic')
-    enzyme_dict = []
-    for enzyme in os.listdir(hic_fp):
-        enzyme_fp = os.path.join(hic_fp, enzyme)
-        for library in os.listdir(os.path.join(enzyme_fp, 'hic_data')):
-            library_fp = os.path.join(enzyme_fp, 'hic_data', library)
-            for rep in os.listdir(library_fp):
-                enzyme_dict.append((rep.split('.')[0], enzyme, library))
-    replicates = pd.DataFrame(
-        enzyme_dict, columns=['replicate', 'enzyme', 'library'])
-    libraries = replicates.groupby(
-        ['library', 'enzyme']).size().reset_index(name='rep_count')
-    from_file = pd.read_csv('/mnt/projects/codes3d/lib/meta_hic.txt', sep='\t')
-    libraries = libraries.merge(
-        from_file, how='left', on=['library', 'enzyme']
-    ).drop_duplicates()
-
-    db = create_engine(
-        'postgres://codes3d:codes3d@127.0.0.1/codes3d_commons', echo=False)
-    libraries.drop_duplicates().to_sql('meta_hic', con=db, if_exists='replace')
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100755
index 03b5bf4..0000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-# python3 version >3.5.2
-pandas == 1.0.5
-numpy == 1.19.1
-pandas-plink == 2.0.4
-psycopg2 == 2.8.5
-requests == 2.24.0
-pybedtools == 0.8.1
-biopython == 1.77
-statsmodels == 0.11.1
-tqdm == 4.48.0
-scikits.bootstrap == 1.0.1
-torch
-pandas_plink == 2.0.4
-sqlalchemy == 1.3.18
-sqlalchemy_utils == 0.36.8
-more_itertools == 8.2.0
diff --git a/environment.yaml b/environment.yaml
index 71ab651..952def2 100755
--- a/environment.yaml
+++ b/environment.yaml
@@ -18,4 +18,4 @@ dependencies:
     - statsmodels #=0.11.1
     - pytorch #=1.9.0
     - tqdm #=4.48.0
-    
+    - more-itertools