Merge branch 'master' into chore/add-create-table-to-hatchery

uc-cdis · Feb 29, 2024 · 1f9afc5 · 1f9afc5
2 parents 6bb29c3 + 6c27fc9
commit 1f9afc5
Showing 1 changed file with 96 additions and 0 deletions.
diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py
@@ -35,6 +35,16 @@
     "Buisness Development": "Business Development"
 }
 
+# Study-page URL template per data repository name; "<STUDY_ID>" is replaced with the repository's study ID
+REPOSITORY_STUDY_ID_LINK_TEMPLATE = {
+    "NIDDK Central": "https://repository.niddk.nih.gov/studies/<STUDY_ID>/",
+    "NIDA Data Share": "https://datashare.nida.nih.gov/study/<STUDY_ID>",
+    "NICHD DASH": "https://dash.nichd.nih.gov/study/<STUDY_ID>",
+    "ICPSR": "https://www.icpsr.umich.edu/web/ICPSR/studies/<STUDY_ID>",
+    "BioSystics-AP": "https://biosystics-ap.com/assays/assaystudy/<STUDY_ID>/",
+}
+
+
 # Defines field that we don't want to include in the filters
 OMITTED_VALUES_MAPPING = {
     "study_metadata.human_subject_applicability.gender_applicability": "Not applicable"
@@ -114,6 +124,31 @@ def get_client_token(client_id: str, client_secret: str):
     return token
 
 
+def get_related_studies(serial_num, hostname):
+    """Return title/link entries for MDS studies that share an NIH serial number.
+
+    Args:
+        serial_num: serial number to match; a falsy value skips the lookup.
+        hostname: commons hostname used to build each discovery-page link.
+
+    Returns:
+        List of {"title": ..., "link": ...} dicts; empty if skipped or MDS is not HTTP 200.
+    """
+    if not serial_num:
+        return []
+    # Let requests URL-encode the query; timeout=60 makes a stalled MDS raise instead of hanging the job.
+    query = {"nih_reporter.project_num_split.serial_num": serial_num, "data": "true", "limit": 2000}
+    mds = requests.get("https://revproxy-service/mds/metadata", params=query, timeout=60)
+    if mds.status_code != 200:
+        print(f"Unable to get related studies for {serial_num}: HTTP {mds.status_code}")
+        return []
+    related_study_result = []
+    for guid, record in mds.json().items():
+        title = record.get("gen3_discovery", {}).get("study_metadata", {}).get("minimal_info", {}).get("study_name", "")
+        related_study_result.append({"title": title, "link": f"https://{hostname}/portal/discovery/{guid}/"})
+    return related_study_result
+
+
 parser = argparse.ArgumentParser()
 
 parser.add_argument("--directory", help="CEDAR Directory ID for registering ")
@@ -214,6 +249,67 @@ def get_client_token(client_id: str, client_secret: str):
                 mds_res["gen3_discovery"]["study_metadata"].update(cedar_record)
                 mds_res["gen3_discovery"]["study_metadata"]["metadata_location"]["other_study_websites"] = cedar_record_other_study_websites
 
+                # setup citations ("citation" is created first when the record lacks that section)
+                doi_citation = mds_res["gen3_discovery"]["study_metadata"].get("doi_citation", "")
+                mds_res["gen3_discovery"]["study_metadata"].setdefault("citation", {})["heal_platform_citation"] = doi_citation
+
+
+                # setup repository_study_link
+                # Read from gen3_discovery.study_metadata (the section written below), not the top level of mds_res.
+                data_repositories = (
+                    mds_res["gen3_discovery"]["study_metadata"]
+                    .get("metadata_location", {})
+                    .get("data_repositories", [])
+                ) or []
+                repository_citation = "Users must also include a citation to the data as specified by the local repository."
+                repository_citation_additional_text = ' The link to the study page at the local repository can be found in the "Data" tab.'
+                for repository in data_repositories:
+                    # only repositories with a known link template and a study ID get a link
+                    if (
+                        repository.get("repository_name") in REPOSITORY_STUDY_ID_LINK_TEMPLATE
+                        and repository.get("repository_study_ID")
+                    ):
+                        repository_study_link = REPOSITORY_STUDY_ID_LINK_TEMPLATE[
+                            repository["repository_name"]
+                        ].replace("<STUDY_ID>", str(repository["repository_study_ID"]))
+                        repository.update({"repository_study_link": repository_study_link})
+                        # mention the "Data" tab at most once, however many links were built
+                        if repository_citation_additional_text not in repository_citation:
+                            repository_citation += repository_citation_additional_text
+                if data_repositories:
+                    # the citation text is stored on the first repository entry only
+                    data_repositories[0] = {
+                        **data_repositories[0],
+                        "repository_citation": repository_citation,
+                    }
+                mds_res["gen3_discovery"]["study_metadata"]["metadata_location"]["data_repositories"] = data_repositories
+
+
+
+                # set up related studies
+                serial_num = None
+                try:
+                    serial_num = (
+                        mds_res
+                        .get("nih_reporter", {})
+                        .get("project_num_split", {})
+                        .get("serial_num", None)
+                    )
+                except AttributeError:
+                    pass  # a non-dict intermediate value; reported below as a missing serial number
+
+                if serial_num is None:
+                    print("Unable to get serial number for study")
+
+                # get_related_studies returns [] when serial_num is falsy
+                related_study_result = get_related_studies(serial_num, hostname)
+                # NOTE(review): read from top-level "related_studies" but written under "gen3_discovery" - confirm
+                existing_related_study_result = mds_res.get("related_studies", [])
+                for related_study in related_study_result:
+                    if related_study not in existing_related_study_result:
+                        existing_related_study_result.append(copy.deepcopy(related_study))
+                mds_res["gen3_discovery"]["related_studies"] = copy.deepcopy(existing_related_study_result)
+
                 # merge data from cedar that is not study level metadata into a level higher
                 deleted_keys = []
                 for key, value in mds_res["gen3_discovery"]["study_metadata"].items():