From 31a3f1fcf3aff9e4bc0e5ad8690b200ea3c601dd Mon Sep 17 00:00:00 2001
From: Mingfei Shao <2475897+mfshao@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:52:31 -0500
Subject: [PATCH 1/5] HP-1562 Fix/citation (#2569)

* fix platform citation

* fix repo citation

* fix get repo link
---
 files/scripts/healdata/heal-cedar-data-ingest.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py
index aa432dc98..bd59076c1 100644
--- a/files/scripts/healdata/heal-cedar-data-ingest.py
+++ b/files/scripts/healdata/heal-cedar-data-ingest.py
@@ -283,7 +283,7 @@ def get_related_studies(serial_num, guid, hostname):
                 ] = cedar_record_other_study_websites
 
                 # setup citations
-                doi_citation = mds_res["gen3_discovery"]["study_metadata"].get(
+                doi_citation = mds_res["gen3_discovery"].get(
                     "doi_citation", ""
                 )
                 mds_res["gen3_discovery"]["study_metadata"]["citation"][
@@ -312,11 +312,9 @@ def get_related_studies(serial_num, guid, hostname):
                         repository.update(
                             {"repository_study_link": repository_study_link}
                         )
-                        if (
-                            repository_citation_additional_text
-                            not in repository_citation
-                        ):
-                            repository_citation += repository_citation_additional_text
+                    if (repository.get("repository_study_link", None) and repository_citation_additional_text
+                            not in repository_citation):
+                        repository_citation += repository_citation_additional_text
                 if len(data_repositories):
                     data_repositories[0] = {
                         **data_repositories[0],

From 7d8de3aab726b672fb4465e6ed821a7df2b06c29 Mon Sep 17 00:00:00 2001
From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com>
Date: Thu, 20 Jun 2024 14:04:46 -0600
Subject: [PATCH 2/5] Removing duplicate value (#2573)

---
 files/squid_whitelist/web_whitelist | 1 -
 1 file changed, 1 deletion(-)

diff --git a/files/squid_whitelist/web_whitelist b/files/squid_whitelist/web_whitelist
index 2c77595ba..241262414 100644
--- a/files/squid_whitelist/web_whitelist
+++ b/files/squid_whitelist/web_whitelist
@@ -31,7 +31,6 @@ centos.mirrors.hoobly.com
 centos.mirrors.tds.net
 centos.mirrors.wvstateu.edu
 cernvm.cern.ch
-charts.bitnami.com
 charts.helm.sh
 cloud.r-project.org
 coredns.github.io

From fd5d5bd9a28b3873fc9c3e8bbc0e0ca4ee268234 Mon Sep 17 00:00:00 2001
From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com>
Date: Thu, 20 Jun 2024 15:43:21 -0600
Subject: [PATCH 3/5] Update web_whitelist (#2574)

---
 files/squid_whitelist/web_whitelist | 1 +
 1 file changed, 1 insertion(+)

diff --git a/files/squid_whitelist/web_whitelist b/files/squid_whitelist/web_whitelist
index 241262414..1f7de95ec 100644
--- a/files/squid_whitelist/web_whitelist
+++ b/files/squid_whitelist/web_whitelist
@@ -136,6 +136,7 @@ registry.terraform.io
 releases.rancher.com
 rendersnake.googlecode.com
 repec.org
+repo.broadcom.com
 repo-prod.prod.sagebase.org
 repo-staging.prod.sagebase.org
 repo.continuum.io

From cbc2b69f07e6c445dc27ce68f4dafe6f228a512c Mon Sep 17 00:00:00 2001
From: Andrew Prokhorenkov <aprokh@uchicago.edu>
Date: Fri, 21 Jun 2024 11:33:41 -0500
Subject: [PATCH 4/5] feat: remove Atlas/WebAPI configuration, not needed, it's
 exposed via ALB (#2575)

---
 .../gen3.nginx.conf/ohdsi-atlas-service.conf         | 12 ------------
 .../gen3.nginx.conf/ohdsi-webapi-service.conf        | 12 ------------
 2 files changed, 24 deletions(-)
 delete mode 100644 kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf
 delete mode 100644 kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf

diff --git a/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf b/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf
deleted file mode 100644
index f482e4824..000000000
--- a/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf
+++ /dev/null
@@ -1,12 +0,0 @@
-          location /ohdsi-atlas/ {
-              if ($csrf_check !~ ^ok-\S.+$) {
-                return 403 "failed csrf check";
-              }
-
-              set $proxy_service  "ohdsi-atlas";
-              # upstream is written to logs
-              set $upstream http://ohdsi-atlas-service.$namespace.svc.cluster.local;
-              rewrite ^/ohdsi-atlas/(.*) /$1 break;
-              proxy_pass $upstream;
-              client_max_body_size 0;
-          }
diff --git a/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf b/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf
deleted file mode 100644
index cd0d41f0a..000000000
--- a/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf
+++ /dev/null
@@ -1,12 +0,0 @@
-          location /ohdsi-webapi/ {
-              if ($csrf_check !~ ^ok-\S.+$) {
-                return 403 "failed csrf check";
-              }
-
-              set $proxy_service  "ohdsi-webapi";
-              # upstream is written to logs
-              set $upstream http://ohdsi-webapi-service.$namespace.svc.cluster.local;
-              rewrite ^/ohdsi-webapi/(.*) /$1 break;
-              proxy_pass $upstream;
-              client_max_body_size 0;
-          }

From e5315f49d41fe06dfe7bc379c68f4281fad34b4d Mon Sep 17 00:00:00 2001
From: Mingfei Shao <2475897+mfshao@users.noreply.github.com>
Date: Mon, 24 Jun 2024 12:46:51 -0500
Subject: [PATCH 5/5] HP-1521 Feat/refresh ctgov metadata (#2570)

* feat: fetch ct.gov metadata

* check exception

* fix exception

* clean up old clinicaltrials_gov

* fix check existence

* debug

* rate limit

* debug

* debug

* fix request

* remove unused imports

* trigger gh action
---
 .../healdata/heal-cedar-data-ingest.py        | 111 ++++++++++++++++--
 1 file changed, 103 insertions(+), 8 deletions(-)

diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py
index bd59076c1..0e7cf8ef3 100644
--- a/files/scripts/healdata/heal-cedar-data-ingest.py
+++ b/files/scripts/healdata/heal-cedar-data-ingest.py
@@ -1,6 +1,5 @@
 import argparse
 import copy
-import json
 import sys
 import requests
 import pydash
@@ -50,6 +49,50 @@
     "BioSystics-AP": "https://biosystics-ap.com/assays/assaystudy/<STUDY_ID>/",
 }
 
+CLINICAL_TRIALS_GOV_FIELDS = [
+    "NCTId",
+    "OfficialTitle",
+    "BriefTitle",
+    "Acronym",
+    "StudyType",
+    "OverallStatus",
+    "StartDate",
+    "StartDateType",
+    "CompletionDate",
+    "CompletionDateType",
+    "IsFDARegulatedDrug",
+    "IsFDARegulatedDevice",
+    "IsPPSD",
+    "BriefSummary",
+    "DetailedDescription",
+    "Condition",
+    "DesignAllocation",
+    "DesignPrimaryPurpose",
+    "Phase",
+    "DesignInterventionModel",
+    "EnrollmentCount",
+    "EnrollmentType",
+    "DesignObservationalModel",
+    "InterventionType",
+    "PrimaryOutcomeMeasure",
+    "SecondaryOutcomeMeasure",
+    "OtherOutcomeMeasure",
+    "Gender",
+    "GenderBased",
+    "MaximumAge",
+    "MinimumAge",
+    "IPDSharing",
+    "IPDSharingTimeFrame",
+    "IPDSharingAccessCriteria",
+    "IPDSharingURL",
+    "SeeAlsoLinkURL",
+    "AvailIPDURL",
+    "AvailIPDId",
+    "AvailIPDComment",
+    "PatientRegistry",
+    "DesignTimePerspective",
+]
+
 
 def is_valid_uuid(uuid_to_test, version=4):
     """
@@ -76,7 +119,11 @@ def is_valid_uuid(uuid_to_test, version=4):
 def update_filter_metadata(metadata_to_update):
     # Retain these from existing filters
     save_filters = ["Common Data Elements"]
-    filter_metadata = [filter for filter in metadata_to_update["advSearchFilters"] if filter["key"] in save_filters]
+    filter_metadata = [
+        filter
+        for filter in metadata_to_update["advSearchFilters"]
+        if filter["key"] in save_filters
+    ]
     for metadata_field_key, filter_field_key in FILTER_FIELD_MAPPINGS.items():
         filter_field_values = pydash.get(metadata_to_update, metadata_field_key)
         if filter_field_values:
@@ -99,7 +146,12 @@ def update_filter_metadata(metadata_to_update):
     filter_metadata = pydash.uniq(filter_metadata)
     metadata_to_update["advSearchFilters"] = filter_metadata
     # Retain these from existing tags
-    save_tags = ["Data Repository", "Common Data Elements", "RequiredIDP",  "Additional Acknowledgement"]
+    save_tags = [
+        "Data Repository",
+        "Common Data Elements",
+        "RequiredIDP",
+        "Additional Acknowledgement",
+    ]
     tags = [tag for tag in metadata_to_update["tags"] if tag["category"] in save_tags]
     # Add any new tags from advSearchFilters
     for f in metadata_to_update["advSearchFilters"]:
@@ -166,6 +218,21 @@ def get_related_studies(serial_num, guid, hostname):
     return related_study_result
 
 
+def get_clinical_trials_gov_metadata(nct_id):
+    """Return clinicaltrials.gov v2 metadata for nct_id (None if it is empty); raises Exception on failure."""
+    if not nct_id:
+        return None
+    url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?fields={'|'.join(CLINICAL_TRIALS_GOV_FIELDS)}"
+    try:
+        # Bound the wait so a stalled connection cannot hang the whole ingest run.
+        ct_metadata_result = requests.get(url, timeout=30)
+        if ct_metadata_result.status_code == 200:
+            return ct_metadata_result.json()
+    except Exception as exc:
+        raise Exception(f"Could not get clinicaltrials.gov metadata: {exc}") from exc
+    raise Exception(f"Could not get clinicaltrials.gov metadata, error code {ct_metadata_result.status_code}")
+
+
 parser = argparse.ArgumentParser()
 
 parser.add_argument("--directory", help="CEDAR Directory ID for registering ")
@@ -231,7 +298,8 @@ def get_related_studies(serial_num, guid, hostname):
         for cedar_record in metadata_return["metadata"]["records"]:
             # get the CEDAR instance id from cedar for querying in our MDS
             cedar_instance_id = pydash.get(
-                cedar_record, "metadata_location.cedar_study_level_metadata_template_instance_ID"
+                cedar_record,
+                "metadata_location.cedar_study_level_metadata_template_instance_ID",
             )
             if cedar_instance_id is None:
                 print("This record doesn't have CEDAR instance id, skipping...")
@@ -246,7 +314,9 @@ def get_related_studies(serial_num, guid, hostname):
 
                 # the query result key is the record of the metadata. If it doesn't return anything then our query failed.
                 if len(list(mds_res.keys())) == 0 or len(list(mds_res.keys())) > 1:
-                    print(f"Query returned nothing for template_instance_ID={cedar_instance_id}&data=true")
+                    print(
+                        f"Query returned nothing for template_instance_ID={cedar_instance_id}&data=true"
+                    )
                     continue
 
                 # get the key for our mds record
@@ -273,8 +343,10 @@ def get_related_studies(serial_num, guid, hostname):
                 ).get("other_study_websites", [])
                 # this ensures the nih_application_id, cedar_study_level_metadata_template_instance_ID and study_name are not alterable from CEDAR side
                 del cedar_record["metadata_location"]
-                cedar_record["minimal_info"]["study_name"] = mds_res["gen3_discovery"]["study_metadata"].get("minimal_info", {}).get(
-                    "study_name", ""
+                cedar_record["minimal_info"]["study_name"] = (
+                    mds_res["gen3_discovery"]["study_metadata"]
+                    .get("minimal_info", {})
+                    .get("study_name", "")
                 )
 
                 mds_res["gen3_discovery"]["study_metadata"].update(cedar_record)
@@ -342,7 +414,9 @@ def get_related_studies(serial_num, guid, hostname):
                 related_study_result = get_related_studies(
                     serial_num, mds_record_guid, hostname
                 )
-                mds_res["gen3_discovery"]["related_studies"] = copy.deepcopy(related_study_result)
+                mds_res["gen3_discovery"]["related_studies"] = copy.deepcopy(
+                    related_study_result
+                )
 
                 # merge data from cedar that is not study level metadata into a level higher
                 deleted_keys = []
@@ -357,6 +431,27 @@ def get_related_studies(serial_num, guid, hostname):
                     mds_res["gen3_discovery"]
                 )
 
+                clinical_trials_id = None
+                try:
+                    clinical_trials_id = (
+                        mds_res["gen3_discovery"]["study_metadata"]
+                            .get("metadata_location", {})
+                            .get("clinical_trials_study_ID", "")
+                    )
+                except Exception:
+                    print("Unable to get clinical_trials_study_ID for study")
+                if clinical_trials_id:
+                    try:
+                        ct_gov_metadata = get_clinical_trials_gov_metadata(clinical_trials_id)
+                        if ct_gov_metadata:
+                            print(f"Got clinicaltrials.gov metadata for {mds_record_guid} with NCT ID {clinical_trials_id}")
+                            mds_cedar_register_data_body["clinicaltrials_gov"] = copy.deepcopy(ct_gov_metadata)
+                    except Exception as ex:
+                        print(f'{ex}')
+                # This means the old clinicaltrials_gov section is actually from CEDAR not clinicaltrials.gov, so remove it
+                elif "clinicaltrials_gov" in mds_cedar_register_data_body:
+                    del mds_cedar_register_data_body["clinicaltrials_gov"]
+
                 mds_cedar_register_data_body["gen3_discovery"] = mds_discovery_data_body
 
                 mds_cedar_register_data_body["_guid_type"] = "discovery_metadata"