From 31a3f1fcf3aff9e4bc0e5ad8690b200ea3c601dd Mon Sep 17 00:00:00 2001 From: Mingfei Shao <2475897+mfshao@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:52:31 -0500 Subject: [PATCH 1/5] HP-1562 Fix/citation (#2569) * fix platform citation * fix repo citation * fix get repo link --- files/scripts/healdata/heal-cedar-data-ingest.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py index aa432dc98..bd59076c1 100644 --- a/files/scripts/healdata/heal-cedar-data-ingest.py +++ b/files/scripts/healdata/heal-cedar-data-ingest.py @@ -283,7 +283,7 @@ def get_related_studies(serial_num, guid, hostname): ] = cedar_record_other_study_websites # setup citations - doi_citation = mds_res["gen3_discovery"]["study_metadata"].get( + doi_citation = mds_res["gen3_discovery"].get( "doi_citation", "" ) mds_res["gen3_discovery"]["study_metadata"]["citation"][ @@ -312,11 +312,9 @@ def get_related_studies(serial_num, guid, hostname): repository.update( {"repository_study_link": repository_study_link} ) - if ( - repository_citation_additional_text - not in repository_citation - ): - repository_citation += repository_citation_additional_text + if (repository.get("repository_study_link", None) and repository_citation_additional_text + not in repository_citation): + repository_citation += repository_citation_additional_text if len(data_repositories): data_repositories[0] = { **data_repositories[0], From 7d8de3aab726b672fb4465e6ed821a7df2b06c29 Mon Sep 17 00:00:00 2001 From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com> Date: Thu, 20 Jun 2024 14:04:46 -0600 Subject: [PATCH 2/5] Removing duplicate value (#2573) --- files/squid_whitelist/web_whitelist | 1 - 1 file changed, 1 deletion(-) diff --git a/files/squid_whitelist/web_whitelist b/files/squid_whitelist/web_whitelist index 2c77595ba..241262414 100644 --- a/files/squid_whitelist/web_whitelist +++ b/files/squid_whitelist/web_whitelist @@ -31,7 +31,6 @@ centos.mirrors.hoobly.com centos.mirrors.tds.net centos.mirrors.wvstateu.edu cernvm.cern.ch -charts.bitnami.com charts.helm.sh cloud.r-project.org coredns.github.io From fd5d5bd9a28b3873fc9c3e8bbc0e0ca4ee268234 Mon Sep 17 00:00:00 2001 From: EliseCastle23 <109446148+EliseCastle23@users.noreply.github.com> Date: Thu, 20 Jun 2024 15:43:21 -0600 Subject: [PATCH 3/5] Update web_whitelist (#2574) --- files/squid_whitelist/web_whitelist | 1 + 1 file changed, 1 insertion(+) diff --git a/files/squid_whitelist/web_whitelist b/files/squid_whitelist/web_whitelist index 241262414..1f7de95ec 100644 --- a/files/squid_whitelist/web_whitelist +++ b/files/squid_whitelist/web_whitelist @@ -136,6 +136,7 @@ registry.terraform.io releases.rancher.com rendersnake.googlecode.com repec.org +repo.broadcom.com repo-prod.prod.sagebase.org repo-staging.prod.sagebase.org repo.continuum.io From cbc2b69f07e6c445dc27ce68f4dafe6f228a512c Mon Sep 17 00:00:00 2001 From: Andrew Prokhorenkov Date: Fri, 21 Jun 2024 11:33:41 -0500 Subject: [PATCH 4/5] feat: remove Atlas/WebAPI configuration, not needed, it's exposed via ALB (#2575) --- .../gen3.nginx.conf/ohdsi-atlas-service.conf | 12 ------------ .../gen3.nginx.conf/ohdsi-webapi-service.conf | 12 ------------ 2 files changed, 24 deletions(-) delete mode 100644 kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf delete mode 100644 kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf diff --git a/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf b/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf deleted file mode 100644 index f482e4824..000000000 --- a/kube/services/revproxy/gen3.nginx.conf/ohdsi-atlas-service.conf +++ /dev/null @@ -1,12 +0,0 @@ - location /ohdsi-atlas/ { - if ($csrf_check !~ ^ok-\S.+$) { - return 403 "failed csrf check"; - } - - set $proxy_service "ohdsi-atlas"; - # upstream is written to logs - set $upstream http://ohdsi-atlas-service.$namespace.svc.cluster.local; - rewrite ^/ohdsi-atlas/(.*) /$1 break; - proxy_pass $upstream; - client_max_body_size 0; - } diff --git a/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf b/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf deleted file mode 100644 index cd0d41f0a..000000000 --- a/kube/services/revproxy/gen3.nginx.conf/ohdsi-webapi-service.conf +++ /dev/null @@ -1,12 +0,0 @@ - location /ohdsi-webapi/ { - if ($csrf_check !~ ^ok-\S.+$) { - return 403 "failed csrf check"; - } - - set $proxy_service "ohdsi-webapi"; - # upstream is written to logs - set $upstream http://ohdsi-webapi-service.$namespace.svc.cluster.local; - rewrite ^/ohdsi-webapi/(.*) /$1 break; - proxy_pass $upstream; - client_max_body_size 0; - } From e5315f49d41fe06dfe7bc379c68f4281fad34b4d Mon Sep 17 00:00:00 2001 From: Mingfei Shao <2475897+mfshao@users.noreply.github.com> Date: Mon, 24 Jun 2024 12:46:51 -0500 Subject: [PATCH 5/5] HP-1521 Feat/refresh ctgov metadata (#2570) * feat: fetch ct.gov metadata * check exception * fix excption * clean up old clinicaltrials_gov * fix check existence * debug * rate limit * debug * debug * fix request * remove unused imports * trigger gh action --- .../healdata/heal-cedar-data-ingest.py | 111 ++++++++++++++++-- 1 file changed, 103 insertions(+), 8 deletions(-) diff --git a/files/scripts/healdata/heal-cedar-data-ingest.py b/files/scripts/healdata/heal-cedar-data-ingest.py index bd59076c1..0e7cf8ef3 100644 --- a/files/scripts/healdata/heal-cedar-data-ingest.py +++ b/files/scripts/healdata/heal-cedar-data-ingest.py @@ -1,6 +1,5 @@ import argparse import copy -import json import sys import requests import pydash @@ -50,6 +49,50 @@ "BioSystics-AP": "https://biosystics-ap.com/assays/assaystudy//", } +CLINICAL_TRIALS_GOV_FIELDS = [ + "NCTId", + "OfficialTitle", + "BriefTitle", + "Acronym", + "StudyType", + "OverallStatus", + "StartDate", + "StartDateType", + "CompletionDate", + "CompletionDateType", + "IsFDARegulatedDrug", + "IsFDARegulatedDevice", + "IsPPSD", + "BriefSummary", + "DetailedDescription", + "Condition", + "DesignAllocation", + "DesignPrimaryPurpose", + "Phase", + "DesignInterventionModel", + "EnrollmentCount", + "EnrollmentType", + "DesignObservationalModel", + "InterventionType", + "PrimaryOutcomeMeasure", + "SecondaryOutcomeMeasure", + "OtherOutcomeMeasure", + "Gender", + "GenderBased", + "MaximumAge", + "MinimumAge", + "IPDSharing", + "IPDSharingTimeFrame", + "IPDSharingAccessCriteria", + "IPDSharingURL", + "SeeAlsoLinkURL", + "AvailIPDURL", + "AvailIPDId", + "AvailIPDComment", + "PatientRegistry", + "DesignTimePerspective", +] + def is_valid_uuid(uuid_to_test, version=4): """ @@ -76,7 +119,11 @@ def is_valid_uuid(uuid_to_test, version=4): def update_filter_metadata(metadata_to_update): # Retain these from existing filters save_filters = ["Common Data Elements"] - filter_metadata = [filter for filter in metadata_to_update["advSearchFilters"] if filter["key"] in save_filters] + filter_metadata = [ + filter + for filter in metadata_to_update["advSearchFilters"] + if filter["key"] in save_filters + ] for metadata_field_key, filter_field_key in FILTER_FIELD_MAPPINGS.items(): filter_field_values = pydash.get(metadata_to_update, metadata_field_key) if filter_field_values: @@ -99,7 +146,12 @@ def update_filter_metadata(metadata_to_update): filter_metadata = pydash.uniq(filter_metadata) metadata_to_update["advSearchFilters"] = filter_metadata # Retain these from existing tags - save_tags = ["Data Repository", "Common Data Elements", "RequiredIDP", "Additional Acknowledgement"] + save_tags = [ + "Data Repository", + "Common Data Elements", + "RequiredIDP", + "Additional Acknowledgement", + ] tags = [tag for tag in metadata_to_update["tags"] if tag["category"] in save_tags] # Add any new tags from advSearchFilters for f in metadata_to_update["advSearchFilters"]: @@ -166,6 +218,21 @@ def get_related_studies(serial_num, guid, hostname): return related_study_result +def get_clinical_trials_gov_metadata(nct_id): + if not nct_id: + return None + ct_metadata = {} + try: + ct_metadata_result = requests.get(f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?fields={'|'.join(CLINICAL_TRIALS_GOV_FIELDS)}") + if ct_metadata_result.status_code != 200: + raise Exception(f"Could not get clinicaltrials.gov metadata, error code {ct_metadata_result.status_code}") + else: + ct_metadata = ct_metadata_result.json() + except Exception as exc: + raise Exception(f"Could not get clinicaltrials.gov metadata: {exc}") from exc + return ct_metadata + + parser = argparse.ArgumentParser() parser.add_argument("--directory", help="CEDAR Directory ID for registering ") @@ -231,7 +298,8 @@ def get_related_studies(serial_num, guid, hostname): for cedar_record in metadata_return["metadata"]["records"]: # get the CEDAR instance id from cedar for querying in our MDS cedar_instance_id = pydash.get( - cedar_record, "metadata_location.cedar_study_level_metadata_template_instance_ID" + cedar_record, + "metadata_location.cedar_study_level_metadata_template_instance_ID", ) if cedar_instance_id is None: print("This record doesn't have CEDAR instance id, skipping...") @@ -246,7 +314,9 @@ def get_related_studies(serial_num, guid, hostname): # the query result key is the record of the metadata. If it doesn't return anything then our query failed. if len(list(mds_res.keys())) == 0 or len(list(mds_res.keys())) > 1: - print(f"Query returned nothing for template_instance_ID={cedar_instance_id}&data=true") + print( + f"Query returned nothing for template_instance_ID={cedar_instance_id}&data=true" + ) continue # get the key for our mds record @@ -273,8 +343,10 @@ def get_related_studies(serial_num, guid, hostname): ).get("other_study_websites", []) # this ensures the nih_application_id, cedar_study_level_metadata_template_instance_ID and study_name are not alterable from CEDAR side del cedar_record["metadata_location"] - cedar_record["minimal_info"]["study_name"] = mds_res["gen3_discovery"]["study_metadata"].get("minimal_info", {}).get( - "study_name", "" + cedar_record["minimal_info"]["study_name"] = ( + mds_res["gen3_discovery"]["study_metadata"] + .get("minimal_info", {}) + .get("study_name", "") ) mds_res["gen3_discovery"]["study_metadata"].update(cedar_record) @@ -342,7 +414,9 @@ def get_related_studies(serial_num, guid, hostname): related_study_result = get_related_studies( serial_num, mds_record_guid, hostname ) - mds_res["gen3_discovery"]["related_studies"] = copy.deepcopy(related_study_result) + mds_res["gen3_discovery"]["related_studies"] = copy.deepcopy( + related_study_result + ) # merge data from cedar that is not study level metadata into a level higher deleted_keys = [] @@ -357,6 +431,27 @@ def get_related_studies(serial_num, guid, hostname): mds_res["gen3_discovery"] ) + clinical_trials_id = None + try: + clinical_trials_id = ( + mds_res["gen3_discovery"]["study_metadata"] + .get("metadata_location", {}) + .get("clinical_trials_study_ID", "") + ) + except Exception: + print("Unable to get clinical_trials_study_ID for study") + if clinical_trials_id: + try: + ct_gov_metadata = get_clinical_trials_gov_metadata(clinical_trials_id) + if ct_gov_metadata: + print(f"Got clinicaltrials.gov metadata for {mds_record_guid} with NCT ID {clinical_trials_id}") + mds_cedar_register_data_body["clinicaltrials_gov"] = copy.deepcopy(ct_gov_metadata) + except Exception as ex: + print(f'{ex}') + # This means the old clinicaltrials_gov section is actually from CEDAR not clinicaltrials.gov, so remove it + elif "clinicaltrials_gov" in mds_cedar_register_data_body: + del mds_cedar_register_data_body["clinicaltrials_gov"] + mds_cedar_register_data_body["gen3_discovery"] = mds_discovery_data_body mds_cedar_register_data_body["_guid_type"] = "discovery_metadata"