Many tests fixed and code refactors.
ptth222 committed Oct 9, 2023
1 parent 86682d1 commit 58734ff
Showing 148 changed files with 39,990 additions and 20,102 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -13,4 +13,5 @@ coverage.xml
.coverage
htmlcov/
README_old.rst
testing_scratch/
tests/testing_files/new_intermediate_results/
5 changes: 5 additions & 0 deletions docs/reporting.rst
@@ -164,12 +164,15 @@ Keywords
<last_author>
<authors> Will be replaced with a comma-separated list of the names of all authors.
<grants> Will be replaced with a comma-separated list of grants associated with the publication.
<queried_sources> Will be replaced with a comma-separated list of the sources where information was found for the publication.
Pub Author Keywords - Pulled from the authors section of each publication in the publications.json file.
<pub_author_first>
<pub_author_last>
<pub_author_initials>
<pub_author_affiliations>
<pub_author_ORCID>
<pub_author_id>
Author Keywords - Pulled from the Authors section of the configuration JSON file.
<author_first>
@@ -377,6 +380,8 @@ Keywords
<pub_author_last> - Collaborator's last name.
<pub_author_initials> - Collaborator's initials.
<pub_author_affiliations> - Collaborator's affiliations.
<pub_author_ORCID> - Collaborator's ORCID.
<pub_author_id> - Collaborator's ID.
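For instance, a hypothetical collaborator report template fragment built from these keywords (an illustrative sketch only; the documented templates follow under Examples) might be:

<pub_author_last>, <pub_author_first> (<pub_author_ORCID>) - <pub_author_affiliations>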
Examples
85 changes: 74 additions & 11 deletions src/academic_tracker/__main__.py
@@ -13,18 +13,21 @@
[--save-all-queries]
[--PMID-reference --PMID_reference]
[--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--no-Crossref --no_Crossref]
[--no-PubMed --no_PubMed]
[--verbose --silent]
academic_tracker find_ORCID <config_json_file> [--verbose --silent]
academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
academic_tracker add_authors <config_json_file> <authors_file> [--verbose --silent]
academic_tracker tokenize_reference <references_file_or_URL> [--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--verbose --silent]
academic_tracker gen_reports_and_emails_auth <config_json_file> <publication_json_file> [--test --verbose --silent]
academic_tracker gen_reports_and_emails_ref <config_json_file> <references_file_or_URL> <publication_json_file> [--test]
[--prev-pub=<file-path> --prev_pub=<file-path>]
[--MEDLINE-reference --MEDLINE_reference]
[--keep-duplicates]
[--verbose --silent]
Options:
@@ -37,6 +40,7 @@
Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
--prev_pub=<file-path> Deprecated. Use --prev-pub instead.
--save-all-queries Save all queried results from each source in "all_results.json".
--keep-duplicates Duplicate entries are removed after references are tokenized; use this option to keep them.
Reference Type Options:
--PMID-reference Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -82,6 +86,11 @@
VERBOSE = True
SILENT = False

## TODO
## Make sure the documentation says author affiliations are a newline-separated list; they were comma-separated, but had to change to match PubMed.
## Change ref and author search to be aware of collective authors; tokenized citations need to change.
## In the tests for reporting, are the tests using a version of the publication dict that has author affiliations separated with newlines?

def main():

## Have to modify the docstring so docopt can recognize more options than what is written.
@@ -121,7 +130,8 @@ def main():
args["--no_PubMed"] or args["--no-PubMed"],
args["--test"],
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
args["--save-all-queries"])
args["--save-all-queries"],
not args["--keep-duplicates"])
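## Note: the CLI exposes --keep-duplicates, but the library functions take a
## remove_duplicates argument, so the flag is negated here and in the
## tokenize_reference and gen_reports_and_emails_ref branches below; when the
## flag is absent, remove_duplicates is True.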
elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
find_ORCID(args["<config_json_file>"])
elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -130,7 +140,8 @@
add_authors(args["<config_json_file>"], args["<authors_file>"])
elif len(sys.argv) > 1 and sys.argv[1] == "tokenize_reference":
tokenize_reference(args["<references_file_or_URL>"],
args["--MEDLINE_reference"] or args["--MEDLINE-reference"])
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
not args["--keep-duplicates"])
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_auth":
gen_reports_and_emails_auth(args["<config_json_file>"], args["<publication_json_file>"], args["--test"])
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_ref":
@@ -139,7 +150,8 @@
args["<publication_json_file>"],
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
args["--test"],
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
not args["--keep-duplicates"])
else:
print("Unrecognized command")

@@ -193,7 +205,7 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,


def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed,
test, prev_pub_filepath, save_all_results):
test, prev_pub_filepath, save_all_results, remove_duplicates):
"""Query PubMed and Crossref for publications matching a reference.
Read in user inputs and check for errors, query sources based on inputs, build
@@ -207,13 +219,15 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
no_PubMed (bool): If True, search PubMed; otherwise don't. Reduces checking on the config JSON if True.
test (bool): If True, save_dir_name is tracker-test instead of tracker- and emails are not sent.
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json".
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

config_dict, tokenized_citations, has_previous_pubs, prev_pubs = \
ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL,
MEDLINE_reference, no_Crossref, no_PubMed,
prev_pub_filepath)
prev_pub_filepath,
remove_duplicates)

publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)

@@ -374,13 +388,53 @@ def add_authors(config_json_filepath, authors_filepath):
if missing_values:
helper_functions.vprint("Error: The following columns have null values:\n" + "\n".join(missing_values))
sys.exit()


if "first_name" in df.columns and not "last_name" in df.columns:
helper_functions.vprint("Error: There is a 'first_name' column without a matching 'last_name' column.")
sys.exit()

if "last_name" in df.columns and not "first_name" in df.columns:
helper_functions.vprint("Error: There is a 'last_name' column without a matching 'first_name' column.")
sys.exit()

if not "last_name" in df.columns and not "first_name" in df.columns and not "collective_name" in df.columns:
helper_functions.vprint("Error: There must be either a 'collective_name' column or 'first_name' and 'last_name' columns.")
sys.exit()


if not "collective_name" in df.columns:
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
missing_names_indexes = missing_first_or_last_names[missing_first_or_last_names==True].index.values
else:
missing_collective_names = df.loc[:, "collective_name"].isnull()
if "first_name" in df.columns and "last_name" in df.columns:
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
missing_names = missing_collective_names & missing_first_or_last_names
missing_names_indexes = missing_names[missing_names==True].index.values
else:
missing_names_indexes = missing_collective_names[missing_collective_names==True].index.values

if len(missing_names_indexes) > 0:
message = ("Error: The following rows have incomplete name columns:\n" +
"\n".join([str(index+1) for index in missing_names_indexes]) +
"\nEach row must have values in either the 'collective_name' column "
"or the 'first_name' and 'last_name' columns.")
helper_functions.vprint(message)
sys.exit()
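## Illustration only (hypothetical toy data, not part of this module): the
## masking above flags row 0 because it has neither a collective_name nor both
## name columns filled in.
## toy = pandas.DataFrame({"collective_name": [None, "Some Consortium"],
##                         "first_name": ["Ada", None],
##                         "last_name": [None, None]})
## bad = toy["collective_name"].isnull() & toy[["first_name", "last_name"]].isnull().any(axis=1)
## bad[bad].index.values  # array([0])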



for column in required_columns:
df.loc[:, column] = df.loc[:, column].astype(str)

## Assuming all list types are string lists.
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["properties"]
list_type_keys = [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["then"]["properties"]
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["else"]["properties"]
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
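## The Authors schema validates entries conditionally (collective_name vs.
## first_name/last_name), so array-typed keys have to be gathered from the base
## "properties" and from both the conditional "then" and "else" branches.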
for key in list_type_keys:
if key in df.columns:
df.loc[:, key] = df.loc[:, key].astype(str)
@@ -394,19 +448,21 @@ def add_authors(config_json_filepath, authors_filepath):
save_dir_name = "tracker-" + re.sub(r"\-| |\:", "", str(datetime.datetime.now())[2:16])
os.mkdir(save_dir_name)

fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict)
fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict, False)
helper_functions.vprint("Success! configuration.json saved in " + save_dir_name)



def tokenize_reference(ref_path_or_URL, MEDLINE_reference):
def tokenize_reference(ref_path_or_URL, MEDLINE_reference, remove_duplicates):
"""Tokenize input reference file.
Args:
ref_path_or_URL (str): either a filepath to a file to tokenize or a URL to tokenize.
MEDLINE_reference (bool): True indicates that ref_path_or_URL is in MEDLINE format.
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)

report_string = ref_srch_emails_and_reports.create_tokenization_report(tokenized_citations)

@@ -451,7 +507,13 @@ def gen_reports_and_emails_auth(config_json_filepath, publication_json_filepath,



def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publication_json_filepath, MEDLINE_reference, test, prev_pub_filepath):
def gen_reports_and_emails_ref(config_json_filepath,
ref_path_or_URL,
publication_json_filepath,
MEDLINE_reference,
test,
prev_pub_filepath,
remove_duplicates):
"""Generate reports and emails for input publications and reference as if reference_search was ran.
Args:
@@ -461,6 +523,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
MEDLINE_reference (bool): If True, ref_path_or_URL is a filepath to a MEDLINE-formatted file.
test (bool): If True, save_dir_name is tracker-test instead of tracker- and emails are not sent.
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
"""

## read in config file
@@ -480,7 +543,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
if has_previous_pubs:
user_input_checking.prev_pubs_file_check(prev_pubs)

tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)
## Read in publications.json
publication_dict = fileio.load_json(publication_json_filepath)
user_input_checking.prev_pubs_file_check(publication_dict)