Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updating the cedar data ingest #2472

Merged
merged 12 commits into from
Feb 29, 2024
Merged
96 changes: 96 additions & 0 deletions files/scripts/healdata/heal-cedar-data-ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@
"Buisness Development": "Business Development"
}

# repository links
REPOSITORY_STUDY_ID_LINK_TEMPLATE = {
"NIDDK Central": "https://repository.niddk.nih.gov/studies/<STUDY_ID>/",
"NIDA Data Share": "https://datashare.nida.nih.gov/study/<STUDY_ID>",
"NICHD DASH": "https://dash.nichd.nih.gov/study/<STUDY_ID>",
"ICPSR": "https://www.icpsr.umich.edu/web/ICPSR/studies/<STUDY_ID>",
"BioSystics-AP": "https://biosystics-ap.com/assays/assaystudy/<STUDY_ID>/",
}


# Defines field that we don't want to include in the filters
OMITTED_VALUES_MAPPING = {
"study_metadata.human_subject_applicability.gender_applicability": "Not applicable"
Expand Down Expand Up @@ -114,6 +124,31 @@ def get_client_token(client_id: str, client_secret: str):
return token


def get_related_studies(serial_num, hostname):
related_study_result = []

if serial_num:
mds = requests.get(f"https://revproxy-service/mds/metadata?nih_reporter.project_num_split.serial_num={serial_num}&data=true&limit=2000")
if mds.status_code == 200:
related_study_metadata = mds.json()

for (
related_study_metadata_key,
related_study_metadata_value,
) in related_study_metadata.items():
title = (
related_study_metadata_value.get(
"gen3_discovery", {}
)
.get("study_metadata", {})
.get("minimal_info", {})
.get("study_name", "")
)
link = f"https://{hostname}/portal/discovery/{related_study_metadata_key}/"
related_study_result.append({"title": title, "link": link})
return related_study_result


parser = argparse.ArgumentParser()

parser.add_argument("--directory", help="CEDAR Directory ID for registering ")
Expand Down Expand Up @@ -214,6 +249,67 @@ def get_client_token(client_id: str, client_secret: str):
mds_res["gen3_discovery"]["study_metadata"].update(cedar_record)
mds_res["gen3_discovery"]["study_metadata"]["metadata_location"]["other_study_websites"] = cedar_record_other_study_websites

# setup citations
doi_citation = mds_res["gen3_discovery"]["study_metadata"].get("doi_citation", "")
mds_res["gen3_discovery"]["study_metadata"]["citation"]["heal_platform_citation"] = doi_citation


# setup repository_study_link
data_repositories = (
mds_res.get("study_metadata", {})
.get("metadata_location", {})
.get("data_repositories", [])
)
repository_citation = "Users must also include a citation to the data as specified by the local repository."
repository_citation_additional_text = ' The link to the study page at the local repository can be found in the "Data" tab.'
for repository in data_repositories:
if (
repository["repository_name"]
and repository["repository_name"]
in REPOSITORY_STUDY_ID_LINK_TEMPLATE
and repository["repository_study_ID"]
):
repository_study_link = REPOSITORY_STUDY_ID_LINK_TEMPLATE[
repository["repository_name"]
].replace("<STUDY_ID>", repository["repository_study_ID"])
repository.update({"repository_study_link": repository_study_link})
if repository_citation_additional_text not in repository_citation:
repository_citation += repository_citation_additional_text
if len(data_repositories):
data_repositories[0] = {
**data_repositories[0],
"repository_citation": repository_citation,
}
mds_res["gen3_discovery"]["study_metadata"][
"metadata_location"
]["data_repositories"] = data_repositories



# set up related studies
serial_num = None
try:
serial_num = (
mds_res
.get("nih_reporter", {})
.get("project_num_split", {})
.get("serial_num", None)
)
except Exception:
print(f"Unable to get serial number for study")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should also print this if serial_num == None


if serial_num == None:
print(f"Unable to get serial number for study")

related_study_result = get_related_studies(serial_num, hostname)
existing_related_study_result = mds_res.get("related_studies", [])
for related_study in related_study_result:
if related_study not in existing_related_study_result:
existing_related_study_result.append(copy.deepcopy(related_study))
mds_res["gen3_discovery"][
"related_studies"
] = copy.deepcopy(existing_related_study_result)

# merge data from cedar that is not study level metadata into a level higher
deleted_keys = []
for key, value in mds_res["gen3_discovery"]["study_metadata"].items():
Expand Down
Loading