Many tests fixed and code refactors.
ptth222 committed Oct 9, 2023
1 parent 86682d1 commit 58734ff
Showing 148 changed files with 39,990 additions and 20,102 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ coverage.xml
.coverage
htmlcov/
README_old.rst
testing_scratch/
tests/testing_files/new_intermediate_results/
5 changes: 5 additions & 0 deletions docs/reporting.rst
@@ -164,12 +164,15 @@ Keywords
<last_author>
<authors> Will be replaced with a comma-separated list of the names of all authors.
<grants> Will be replaced with a comma-separated list of grants associated with the publication.
<queried_sources> Will be replaced with a comma-separated list of the sources where information was found for the publication.
Pub Author Keywords - Pulled from the authors section of each publication in the publications.json file.
<pub_author_first>
<pub_author_last>
<pub_author_initials>
<pub_author_affiliations>
<pub_author_ORCID>
<pub_author_id>
Author Keywords - Pulled from the Authors section of the configuration JSON file.
<author_first>
@@ -377,6 +380,8 @@ Keywords
<pub_author_last> - Collaborator's last name.
<pub_author_initials> - Collaborator's initials.
<pub_author_affiliations> - Collaborator's affiliations.
<pub_author_ORCID> - Collaborator's ORCID.
<pub_author_id> - Collaborator's ID.
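For instance, a hypothetical collaborator report template fragment built from these keywords (an illustrative sketch only; the documented templates follow under Examples) might be:

<pub_author_last>, <pub_author_first> (<pub_author_ORCID>) - <pub_author_affiliations>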
Examples
85 changes: 74 additions & 11 deletions src/academic_tracker/__main__.py
@@ -13,18 +13,21 @@
[--save-all-queries]
[--PMID-reference --PMID_reference]
[--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--no-Crossref --no_Crossref]
[--no-PubMed --no_PubMed]
[--verbose --silent]
academic_tracker find_ORCID <config_json_file> [--verbose --silent]
academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
academic_tracker add_authors <config_json_file> <authors_file> [--verbose --silent]
academic_tracker tokenize_reference <references_file_or_URL> [--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--verbose --silent]
academic_tracker gen_reports_and_emails_auth <config_json_file> <publication_json_file> [--test --verbose --silent]
academic_tracker gen_reports_and_emails_ref <config_json_file> <references_file_or_URL> <publication_json_file> [--test]
[--prev-pub=<file-path> --prev_pub=<file-path>]
[--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--verbose --silent]
Options:
@@ -37,6 +40,7 @@
Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
--prev_pub=<file-path> Deprecated. Use --prev-pub instead.
--save-all-queries Save all queried results from each source in "all_results.json".
--keep-duplicates Duplicate entries are removed after references are tokenized; use this option to keep them.
Reference Type Options:
--PMID-reference Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -82,6 +86,11 @@
VERBOSE = True
SILENT = False

## TODO
## Make sure the documentation says author affiliations are a newline-separated list; they were comma-separated, but had to change to match PubMed.
## Change ref and author search to be aware of collective authors; tokenized citations need to change.
## In the tests for reporting, are the tests using a version of the publication dict that has author affiliations separated with newlines?

def main():

## Have to modify the docstring so docopt can recognize more options than what is written.
@@ -121,7 +130,8 @@ def main():
args["--no_PubMed"] or args["--no-PubMed"],
args["--test"],
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
args["--save-all-queries"])
args["--save-all-queries"],
not args["--keep-duplicates"])
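## Note: the CLI exposes --keep-duplicates, but the library functions take a
## remove_duplicates argument, so the flag is negated here and in the
## tokenize_reference and gen_reports_and_emails_ref branches below; when the
## flag is absent, remove_duplicates is True.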
elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
find_ORCID(args["<config_json_file>"])
elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -130,7 +140,8 @@
add_authors(args["<config_json_file>"], args["<authors_file>"])
elif len(sys.argv) > 1 and sys.argv[1] == "tokenize_reference":
tokenize_reference(args["<references_file_or_URL>"],
args["--MEDLINE_reference"] or args["--MEDLINE-reference"])
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
not args["--keep-duplicates"])
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_auth":
gen_reports_and_emails_auth(args["<config_json_file>"], args["<publication_json_file>"], args["--test"])
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_ref":
@@ -139,7 +150,8 @@
args["<publication_json_file>"],
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
args["--test"],
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
not args["--keep-duplicates"])
else:
print("Unrecognized command")

@@ -193,7 +205,7 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,


def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed,
test, prev_pub_filepath, save_all_results):
test, prev_pub_filepath, save_all_results, remove_duplicates):
"""Query PubMed and Crossref for publications matching a reference.
Read in user inputs and check for errors, query sources based on inputs, build
@@ -207,13 +219,15 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
no_PubMed (bool): If True, search PubMed; otherwise don't. Reduces checking on the config JSON if True.
test (bool): If True, save_dir_name is tracker-test instead of tracker- and emails are not sent.
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json".
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

config_dict, tokenized_citations, has_previous_pubs, prev_pubs = \
ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL,
MEDLINE_reference, no_Crossref, no_PubMed,
prev_pub_filepath)
prev_pub_filepath,
remove_duplicates)

publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)

@@ -374,13 +388,53 @@ def add_authors(config_json_filepath, authors_filepath):
if missing_values:
helper_functions.vprint("Error: The following columns have null values:\n" + "\n".join(missing_values))
sys.exit()


if "first_name" in df.columns and not "last_name" in df.columns:
helper_functions.vprint("Error: There is a 'first_name' column without a matching 'last_name' column.")
sys.exit()

if "last_name" in df.columns and not "first_name" in df.columns:
helper_functions.vprint("Error: There is a 'last_name' column without a matching 'first_name' column.")
sys.exit()

if not "last_name" in df.columns and not "first_name" in df.columns and not "collective_name" in df.columns:
helper_functions.vprint("Error: There must be either a 'collective_name' column or 'first_name' and 'last_name' columns.")
sys.exit()


if not "collective_name" in df.columns:
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
missing_names_indexes = missing_first_or_last_names[missing_first_or_last_names==True].index.values
else:
missing_collective_names = df.loc[:, "collective_name"].isnull()
if "first_name" in df.columns and "last_name" in df.columns:
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
missing_names = missing_collective_names & missing_first_or_last_names
missing_names_indexes = missing_names[missing_names==True].index.values
else:
missing_names_indexes = missing_collective_names[missing_collective_names==True].index.values

if len(missing_names_indexes) > 0:
message = ("Error: The following rows have incomplete name columns:\n" +
"\n".join([str(index+1) for index in missing_names_indexes]) +
"\nEach row must have values in either the 'collective_name' column "
"or the 'first_name' and 'last_name' columns.")
helper_functions.vprint(message)
sys.exit()
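## Illustration only (hypothetical toy data, not part of this module): the
## masking above flags row 0 because it has neither a collective_name nor both
## name columns filled in.
## toy = pandas.DataFrame({"collective_name": [None, "Some Consortium"],
##                         "first_name": ["Ada", None],
##                         "last_name": [None, None]})
## bad = toy["collective_name"].isnull() & toy[["first_name", "last_name"]].isnull().any(axis=1)
## bad[bad].index.values  # array([0])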



for column in required_columns:
df.loc[:, column] = df.loc[:, column].astype(str)

## Assuming all list types are string lists.
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["properties"]
list_type_keys = [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["then"]["properties"]
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["else"]["properties"]
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
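## The Authors schema validates entries conditionally (collective_name vs.
## first_name/last_name), so array-typed keys have to be gathered from the base
## "properties" and from both the conditional "then" and "else" branches.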
for key in list_type_keys:
if key in df.columns:
df.loc[:, key] = df.loc[:, key].astype(str)
@@ -394,19 +448,21 @@ def add_authors(config_json_filepath, authors_filepath):
save_dir_name = "tracker-" + re.sub(r"\-| |\:", "", str(datetime.datetime.now())[2:16])
os.mkdir(save_dir_name)

fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict)
fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict, False)
helper_functions.vprint("Success! configuration.json saved in " + save_dir_name)



def tokenize_reference(ref_path_or_URL, MEDLINE_reference):
def tokenize_reference(ref_path_or_URL, MEDLINE_reference, remove_duplicates):
"""Tokenize input reference file.
Args:
ref_path_or_URL (str): either a filepath to a file to tokenize or a URL to tokenize.
MEDLINE_reference (bool): True indicates that ref_path_or_URL is in MEDLINE format.
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)

report_string = ref_srch_emails_and_reports.create_tokenization_report(tokenized_citations)

@@ -451,7 +507,13 @@ def gen_reports_and_emails_auth(config_json_filepath, publication_json_filepath,



def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publication_json_filepath, MEDLINE_reference, test, prev_pub_filepath):
def gen_reports_and_emails_ref(config_json_filepath,
ref_path_or_URL,
publication_json_filepath,
MEDLINE_reference,
test,
prev_pub_filepath,
remove_duplicates):
"""Generate reports and emails for input publications and reference as if reference_search was ran.
Args:
@@ -461,6 +523,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
MEDLINE_reference (bool): If True, ref_path_or_URL is a filepath to a MEDLINE-formatted file.
test (bool): If True, save_dir_name is tracker-test instead of tracker- and emails are not sent.
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

## read in config file
@@ -480,7 +543,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
if has_previous_pubs:
user_input_checking.prev_pubs_file_check(prev_pubs)

tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)
## Read in publications.json
publication_dict = fileio.load_json(publication_json_filepath)
user_input_checking.prev_pubs_file_check(publication_dict)