Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add first version of script computing the aggregated stats for a release #19

Merged
merged 8 commits into from
Sep 4, 2020
107 changes: 107 additions & 0 deletions scripts/releases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import argparse
import logging
import os.path
import pandas as pd


def create_release_stats(studies_file, release=None, date=None, size=None):
    """Aggregate the per-study statistics into a single release row.

    The tab-separated studies file is read with pandas and the statistics of
    every study whose 'Introduced' release is not later than ``release`` are
    summed.

    Release names of the form ``prodNN`` are ordered by their numeric suffix,
    so that e.g. ``prod100`` is later than ``prod87`` (a plain string
    comparison would order them the other way round). If the names do not
    all follow that pattern, the original string comparison is used.

    :param studies_file: path of the TSV file containing the study stats
    :param release: name of the release; defaults to the latest value found
        in the 'Introduced' column
    :param date: date of the release, reported as "TBD" when not given
    :param size: database size of the release, reported as "TBD" when not
        given
    :return: a one-row DataFrame with the aggregated release statistics
    """
    studies = pd.read_csv(studies_file, delimiter='\t', encoding='utf-8')

    df = pd.DataFrame(columns=(
        "Date",
        "Data release",
        "Code version",
        "Sets",
        "Wells",
        "Experiments",
        "Images",
        "Planes",
        "Size (TB)",
        "Files (Million)",
        "DB Size (GB)",
    ))

    introduced = studies['Introduced']
    # Numeric suffix of each 'prodNN' name; NaN where the value is missing
    # or does not match the pattern.
    numbers = pd.to_numeric(
        introduced.astype(str).str.extract(r'^prod(\d+)$', expand=False),
        errors='coerce')
    # Numeric ordering is only trusted when every non-missing name parsed.
    all_numbered = bool(
        numbers.notna().any()
        and (numbers.notna() | introduced.isna()).all())

    if not release:
        if all_numbered:
            # Keep the name exactly as spelled in the file.
            release = introduced.loc[numbers.idxmax()]
        else:
            release = max(introduced)

    release_number = None
    if all_numbered and isinstance(release, str):
        suffix = release[len('prod'):]
        if release.startswith('prod') and suffix.isdecimal():
            release_number = int(suffix)

    if release_number is None:
        # Fallback: original lexicographic comparison of the names.
        index = introduced <= release
    else:
        index = numbers <= release_number
    selected = studies[index]

    if not date:
        date = "TBD"
    if not size:
        size = "TBD"
    df.loc[0] = (
        date,
        release,
        get_release_code(release),
        int(selected['Sets'].sum()),
        int(selected['Wells'].sum()),
        "",  # "Experiments" is left blank by this script
        int(selected['5D Images'].sum()),
        int(selected['Planes'].sum()),
        selected['Size (TB)'].sum(),
        selected['# of Files'].sum() / 10 ** 6,
        size)
    return df


def print_release_stats(df, fmt, target=None):
    """Write the release statistics in the requested format.

    ``fmt`` is either 'tsv' or the suffix of any ``pandas.DataFrame.to_*``
    method (e.g. 'csv', 'string', 'json').

    For 'tsv' with a ``target``, the rows are appended to that file without
    header or index and nothing is printed. In every other case the
    rendered output is printed to standard output.
    """
    if fmt == 'tsv' and target:
        # Append mode so that existing rows in the target are preserved.
        df.to_csv(target, sep='\t', mode='a', header=False, index=False)
        return

    if fmt == 'tsv':
        rendered = df.to_csv(sep='\t', header=False, index=False)
    else:
        render = getattr(df, f'to_{fmt}')
        # The JSON renderer is called without the ``index`` argument.
        rendered = render() if fmt == 'json' else render(index=False)
    print(rendered)


def get_release_code(relase_name):
    """Return the code version string matching a release name.

    The first four characters of the name (the length of 'prod') are
    dropped and the remainder is read as an integer: its last decimal
    digit becomes the patch number and the leading digits the minor
    number, e.g. 'prod87' gives '0.8.7'.

    :param relase_name: release name such as 'prod87'
    :return: version string of the form '0.<minor>.<patch>'
    :raises ValueError: if the text after the prefix is not an integer
    """
    number = int(relase_name[len('prod'):])
    # Pure integer arithmetic: no float round-trip, exact for any size.
    minor, patch = divmod(number, 10)
    return f"0.{minor}.{patch}"


def main():
    """Command-line entry point: compute and output the release statistics.

    The statistics are computed from the studies file given on the command
    line. If a 'releases.tsv' file exists next to the studies file, it is
    passed as the append target to print_release_stats; otherwise the
    output is printed.
    """
    # The text is a description, not the program name shown in the usage.
    parser = argparse.ArgumentParser(
        description="Generate release statistics")
    parser.add_argument(
        "--release", default=None, help="Name of the release")
    parser.add_argument(
        "--release-date", default=None, help="Date of the release")
    parser.add_argument(
        "--db-size", default=None, help="Size of the database for the release")
    # Review note: the TSV row is meant to be appended to releases.tsv (the
    # file looked up below), not to the studies file.
    parser.add_argument("--format", default="tsv", help=(
        "Output format, includes 'string', 'csv', 'tsv' (default), and "
        "'json'. "
        "'tsv' can be appended to the IDR releases.tsv file with no further "
        "processing. "
        "All other formats include headers and totals. "
        "'string' is the most human readable (fixed width columns). "
    ))
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument(
        "studies_file", help="Path to TSV file containing study stats")
    ns = parser.parse_args()

    # Each -v raises the verbosity by one step, capped at DEBUG.
    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    level = levels[min(len(levels)-1, ns.verbose)]
    logging.basicConfig(
        level=level, format="%(asctime)s %(levelname)s %(message)s")

    df = create_release_stats(
        ns.studies_file, release=ns.release, date=ns.release_date,
        size=ns.db_size)
    releases_file = os.path.join(
        os.path.dirname(ns.studies_file), 'releases.tsv')
    if os.path.exists(releases_file):
        print_release_stats(df, ns.format, target=releases_file)
    else:
        print_release_stats(df, ns.format)


if __name__ == "__main__":
main()