Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add first version of script computing the aggregated stats for a release #19

Merged
merged 8 commits into from
Sep 4, 2020
108 changes: 108 additions & 0 deletions scripts/releases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import argparse
import logging
import os.path
import pandas as pd


def create_release_stats(studies_file, release=None, date=None, size=None):
    """Build a one-row DataFrame of statistics aggregated up to a release.

    Args:
        studies_file: Path (or any other input accepted by
            ``pandas.read_csv``) of a tab-separated, UTF-8 encoded table.
            The columns ``Introduced``, ``Sets``, ``Wells``, ``5D Images``,
            ``Planes``, ``Size (TB)`` and ``# of Files`` are read.
        release: Name of the release to report on. Rows whose
            ``Introduced`` value is this release or an earlier one are
            aggregated. When falsy, the latest release found in the
            ``Introduced`` column is used.
        date: Value stored in the "Date" column; "TBD" when falsy.
        size: Value stored in the "DB Size (GB)" column; "TBD" when falsy.

    Returns:
        A ``pandas.DataFrame`` holding a single row (index 0) with the
        aggregated statistics. The "Experiments" column is left empty.
    """
    studies = pd.read_csv(studies_file, delimiter='\t', encoding='utf-8')
    introduced = studies['Introduced']

    df = pd.DataFrame(columns=(
        "Date",
        "Data release",
        "Code version",
        "Sets",
        "Wells",
        "Experiments",
        "Images",
        "Planes",
        "Size (TB)",
        "Files (Million)",
        "DB Size (GB)",
    ))

    # Release names must be ordered by their numeric suffix: a plain string
    # comparison is lexicographic and would sort "prod100" before "prod85".
    if not release:
        release = max(introduced.dropna(), key=_release_sort_key)
    cutoff = _release_sort_key(release)
    index = introduced.map(
        # Rows without an "Introduced" value are never part of a release.
        lambda name: pd.notna(name) and _release_sort_key(name) <= cutoff
    ).astype(bool)
    selected = studies[index]

    if not date:
        date = "TBD"
    if not size:
        size = "TBD"

    df.loc[0] = (
        date,
        release,
        get_release_code(release),
        int(selected['Sets'].sum()),
        int(selected['Wells'].sum()),
        "",
        int(selected['5D Images'].sum()),
        int(selected['Planes'].sum()),
        selected['Size (TB)'].sum(),
        # The file count is reported in millions.
        selected['# of Files'].sum() / 10 ** 6,
        size,
    )
    return df


def _release_sort_key(name):
    """Return a key ordering release names by their trailing number."""
    text = str(name)
    prefix = text.rstrip('0123456789')
    digits = text[len(prefix):]
    # Names without a trailing number sort before numbered ones that share
    # the same prefix.
    return prefix, (int(digits) if digits else -1)


def print_release_stats(df, fmt, target=None):
    """Write the release statistics in the requested format.

    Args:
        df: DataFrame to output.
        fmt: Output format. 'tsv' is handled explicitly; any other value
            is dispatched to the matching ``pandas.DataFrame.to_{fmt}``
            method.
        target: Optional file path. Only used with the 'tsv' format, in
            which case the rows are appended to that file (no header, no
            index) and nothing is printed.
    """
    if fmt == 'tsv' and target:
        # Append to the existing table instead of printing.
        df.to_csv(target, sep='\t', mode='a', header=False, index=False)
        return

    if fmt == 'tsv':
        rendered = df.to_csv(sep='\t', header=False, index=False)
    else:
        writer = getattr(df, f'to_{fmt}')
        # to_json is called without the index keyword.
        rendered = writer() if fmt in ('json',) else writer(index=False)
    print(rendered)


def get_release_code(relase_name):
    """Derive the dotted code version from a release name.

    The text following the first four characters (the length of
    ``'prod'``) is parsed as an integer. Its last decimal digit becomes
    the patch number and the remaining leading digits the minor number,
    e.g. ``'prod85'`` gives ``'0.8.5'`` and ``'prod100'`` gives
    ``'0.10.0'``.

    Args:
        relase_name: Release name such as ``'prod85'``.

    Returns:
        The version string ``'0.<minor>.<patch>'``.

    Raises:
        ValueError: If the text after the prefix is not an integer.
    """
    number = int(relase_name[len('prod'):])
    # divmod keeps the arithmetic in integers; true division would go
    # through a float and lose precision for large release numbers.
    minor, patch = divmod(number, 10)
    return f"0.{minor}.{patch}"


def main():
    """Command-line entry point: compute and output release statistics.

    Parses the command-line arguments, configures logging according to the
    number of ``-v`` flags (WARNING, then INFO, then DEBUG), builds the
    statistics with ``create_release_stats`` and outputs them with
    ``print_release_stats``. If a file called ``releases.tsv`` exists in
    the directory of the studies file, it is passed on as the output
    target.
    """
    # The text is the description; passed positionally it would be taken
    # as the program name (``prog``) and shown in the usage line.
    parser = argparse.ArgumentParser(
        description="Generate release statistics")
    parser.add_argument(
        "--release", default=None, help="Name of the release")
    parser.add_argument(
        "--release-date", default=None, help="Date of the release")
    parser.add_argument(
        "--db-size", default=None,
        help="Size of the database for the release in GB")
    parser.add_argument("--format", default="tsv", help=(
        "Output format, includes 'string', 'csv', 'tsv' (default), and "
        "'json'. "
        "'string' is the most human-readable (fixed width columns). "
        "If tsv is selected and a file called releases.tsv exists in the "
        "same directory as the file specified by studies_file, the output "
        "will be appended to this file."
    ))
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument(
        "studies_file", help="Path to TSV file containing study stats")
    ns = parser.parse_args()

    # Clamp the verbosity so extra -v flags still select DEBUG.
    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    level = levels[min(len(levels) - 1, ns.verbose)]
    logging.basicConfig(
        level=level, format="%(asctime)s %(levelname)s %(message)s")

    df = create_release_stats(
        ns.studies_file, release=ns.release, date=ns.release_date,
        size=ns.db_size)
    releases_file = os.path.join(
        os.path.dirname(ns.studies_file), 'releases.tsv')
    if os.path.exists(releases_file):
        print_release_stats(df, ns.format, target=releases_file)
    else:
        print_release_stats(df, ns.format)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()