"""
Scripts for processing data from the IETF DataTracker.
"""

from ietfdata.datatracker import *
from ietfdata.datatracker_ext import *
from dateutil.parser import *

import pandas as pd
import re

# Raw string so "\." is a literal regex escape, not a Python string escape.
# NOTE(fix): the original class was [A-Z|a-z] -- inside [...] the "|" is a
# literal pipe (alternation does not apply), so the pattern wrongly accepted
# "|" in the TLD. It has been dropped.
em_re = r"/api/v1/person/email/([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7})/"

# Single shared DataTracker client for the module; use_cache avoids
# re-fetching unchanged resources from the IETF API.
# (An earlier `dt = DataTrackerExt()` was dead code -- it was immediately
# rebound by this assignment -- and has been removed.)
dt = DataTracker(use_cache=True)


def email_from_uri(email_uri):
    """
    Extract the email address from a DataTracker email URI.

    Parameters
    ----------
    email_uri : str
        URI of the form "/api/v1/person/email/<address>/".

    Returns
    -------
    str or None
        The captured address, or None when the URI does not match.
    """
    m = re.match(em_re, email_uri)

    return m.group(1) if m else None


def get_group_histories(wg_name):
    """
    For a working group name, get the group history objects
    associated with that working group.

    Parameters
    ----------
    wg_name : str
        Working group acronym (e.g. "httpbis").

    Returns
    -------
    tuple
        (group_histories, group_role_histories): the history snapshots for
        the group, and, for each snapshot, an iterator of its Chair role
        history records.
    """
    wg = dt.group_from_acronym(wg_name)
    group_histories = dt.group_histories(group=wg)

    # Resolve the Chair role name once; it is invariant across snapshots.
    chair = dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/"))

    group_role_histories = [
        dt.group_role_histories(group=grp_hist, name=chair)
        for grp_hist in group_histories
    ]

    return group_histories, group_role_histories


def leadership_ranges(group_acronym):
    """
    For a working group acronym, get the data about the changes to the
    Chair role in that working group's history.

    Parameters
    ----------
    group_acronym : str
        Working group acronym (e.g. "httpbis").

    Returns
    -------
    tuple of pandas.DataFrame
        (ghcr_df, agged): the raw per-snapshot chair records, and the same
        records aggregated per person into [datetime_min, datetime_max]
        tenure ranges. Range endpoints equal to the overall observation
        bounds are replaced with None, since they mark the edge of the
        recorded history rather than an actual role change.
    """
    wg = dt.group_from_acronym(group_acronym)
    gh = list(dt.group_histories(group=wg))

    # Hoisted out of the loop: the Chair role name lookup is invariant.
    chair = dt.role_name(RoleNameURI("/api/v1/name/rolename/chair/"))

    # One flat record per (snapshot, chair-role) pair. A single double
    # comprehension replaces the nested-list + sum(..., []) flatten.
    gh_chair_records = [
        {
            "datetime_max": h.time,
            "datetime_min": h.time,
            "email": email_from_uri(r.email.uri),
            "person_uri": r.person.uri,
            "name": dt.person(r.person).name,
            "biography": dt.person(r.person).biography,
        }
        for h in gh
        for r in dt.group_role_histories(group=h, name=chair)
    ]

    ghcr_df = pd.DataFrame.from_records(gh_chair_records)

    agged = ghcr_df.groupby(["name", "person_uri", "email", "biography"]).agg(
        {"datetime_min": "min", "datetime_max": "max"}
    )

    # Earliest/latest timestamps only bound the observation window, not a
    # real appointment/departure; blank them so they read as "unknown".
    agged["datetime_min"].replace({ghcr_df["datetime_min"].min(): None}, inplace=True)
    agged["datetime_max"].replace({ghcr_df["datetime_max"].max(): None}, inplace=True)

    return ghcr_df, agged