From e2e466d596617dbaee3e7c002c4ab5a7d532e8cf Mon Sep 17 00:00:00 2001 From: Elias Chetouane <elias.chetouane@univ-grenoble-alpes.fr> Date: Fri, 10 May 2024 13:12:28 +0200 Subject: [PATCH] Version fonctionelle avec les colonnes "dois_traveled" et "all_relations" --- 1-enrich-with-datacite/concatenate-enrich-dois.py | 12 ++++++------ 1-enrich-with-datacite/z_personal_functions.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/1-enrich-with-datacite/concatenate-enrich-dois.py b/1-enrich-with-datacite/concatenate-enrich-dois.py index 605175d..111059b 100644 --- a/1-enrich-with-datacite/concatenate-enrich-dois.py +++ b/1-enrich-with-datacite/concatenate-enrich-dois.py @@ -56,17 +56,17 @@ for doi in dois : #[:300] ## if new datasets has been founded if temp_rows : df_fresh = pd.DataFrame(temp_rows) - i_to_drop = [] dois_added = list(df_old["doi"]) to_del = [] for i in range(0, len(df_fresh)): - result = my_functions.get_origin_version(df_fresh.loc[df_fresh.index[i], "doi"]) + result = my_functions.get_origin_version(df_fresh.loc[i, "doi"]) if result[0] not in dois_added: dois_added.append(result[0]) - df_fresh.loc[df_fresh.index[i], "doi"] = result[0] - df_fresh.loc[df_fresh.index[i], "relation_nbInstances"] = result[1] - df_fresh.loc[df_fresh.index[i], "relation_nbCitation"] = result[2] - df_fresh.loc[df_fresh.index[i], "relations_all"] = str(result[3]) + df_fresh.loc[i, "doi"] = result[0] + df_fresh.loc[i, "relation_nbInstances"] = result[1] + df_fresh.loc[i, "relation_nbCitation"] = result[2] + if str(result[3]) != "[]": df_fresh.loc[i, "traveled_dois"] = str(result[3]) + if str(result[4]) != "[]": df_fresh.loc[i, "all_relations"] = str(result[4]) else: to_del.append(i) diff --git a/1-enrich-with-datacite/z_personal_functions.py b/1-enrich-with-datacite/z_personal_functions.py index 5ed1fca..a7ddc7d 100644 --- a/1-enrich-with-datacite/z_personal_functions.py +++ b/1-enrich-with-datacite/z_personal_functions.py @@ -1,10 +1,12 @@ import requests, json -def get_origin_version(doi, count=0, cited=0, history=[]): +def get_origin_version(doi, count=0, cited=0, history=[], first=True): + if first: history=[] # ligne ajoutée pour éviter certains soucis de cache où history n'est pas vide au premier appel de la fonction cited = 0 req = requests.get( f"https://api.datacite.org/dois/{doi}" ) res = req.json() - result = (doi, count, cited, history) + final = [] + result = (doi, count, cited, history, final) try: related = res["data"]["attributes"]["relatedIdentifiers"] except: @@ -13,17 +15,19 @@ def get_origin_version(doi, count=0, cited=0, history=[]): ignore = False duplicate = False for i in related: - history.append([i.get("relationType"), i.get("relatedIdentifier")]) + final.append(i.get("relationType")) if i.get("relationType") == "IsVersionOf" and i.get("relatedIdentifierType") == "DOI": ignore = True elem_to_save_i = i.get("relatedIdentifier") + history.append([i.get("relationType"), i.get("relatedIdentifier")]) if i.get("relationType") == "isCitedBy" and i.get("relatedIdentifierType") == "DOI": cited += 1 if i.get("relationType") == "IsIdenticalTo" and i.get("relatedIdentifierType") == "DOI": duplicate = True elem_to_save_d = i.get("relatedIdentifier") + history.append([i.get("relationType"), i.get("relatedIdentifier")]) if duplicate and not(ignore): - result = (elem_to_save_d, count, cited, history) - if ignore: result = get_origin_version(elem_to_save_i, count+1, cited, history) + result = (elem_to_save_d, count, cited, history, final) + if ignore: result = get_origin_version(elem_to_save_i, count+1, cited, history, False) return result def get_md_from_datacite( doi ) : -- GitLab