Commit f3ed5704 authored by Maxence Larrieu

add step 1 and 2

parent 7b967ddd
.gitignore
/0-collect-data/personnal-keys.json
/0-collect-data/bso/
/1-enrich-with-datacite/__pycache__/
/2-produce-graph/__pycache__/
/hide
import z_personal_functions as my_functions
import requests, json, random, pandas as pd
# ______0______ load DOIs and remove duplicates
## specify the list of repositories to import
repo_list = ["nakala", "bso-via-hal", "datacite", "zenodo", "rdg"]
dois_raw = my_functions.from_repos_load_dois(repo_list)
print("DOIs loaded\t\t\t", len(dois_raw))
## remove duplicate
dois = list(set(dois_raw))
print("DOIs to treat\t\t\t", len(dois))
# ______1_____ load metadata from DataCite and keep the selected fields
## to test with a single DOI
# # random doi 10.25656/01:8509
# temp_doi = random.choice(dois)
# #temp_doi = "10.57745/QYIAWX"
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
doi_error = [] # DOIs that could not be retrieved
temp_rows = [] # one dict per DOI, gathered before building the DataFrame
df_old = pd.read_csv("../dois-uga.csv")
print(f"\nnb of dois already treated\t{len(df_old)}")
# query DataCite and extract fields following the parser instructions
for doi in dois : #[:300]
    ## skip DOIs already treated
    if doi in df_old["doi"].values :
        #print(f"\talready treated\t\t{doi}")
        continue
    ## get metadata from DataCite
    raw_md = my_functions.get_md_from_datacite(doi)
    ### if the DOI is not in DataCite
    if raw_md == "error" :
        doi_error.append(doi)
        continue
    ## retrieve the selected fields following the manual instructions
    selected_md = my_functions.parse_datacite_md(raw_md) ## place the results in a dict
    temp_rows.append(selected_md) ## append this dict to the list
    print(f"\tadded\t\t{doi}")
if temp_rows :
    df_fresh = pd.DataFrame(temp_rows)
    df_out = pd.concat([df_old, df_fresh], ignore_index=True)
    df_out.to_csv("../dois-uga.csv", index = False)
    print(f"\n\nnb of dois exported \t{len(df_out)}")
datacite-parser-instructions.json
{
    "title" : "describes the paths and fields of the DataCite metadata to retrieve",
    "version" : "2023-11-20",
    "comments" : {
        "0" : "the dates attribute is not handled",
        "1" : "the relatedIdentifiers attribute is not handled"
    },
    "path-and-fields" : {
        "attributes" : {
            "titles" : {
                "type" : "list of dict",
                "past_first_occ" : "title"
            },
            "publisher" : "string",
            "publicationYear" : "int",
            "subjects" : {
                "past_values_w_this_key" : "subject",
                "flatten_all_in_this_key" : "subject_raw"
            },
            "language" : "string",
            "types" : {
                "type" : "dict",
                "past_values_w_this_key" : "resourceTypeGeneral"
            },
            "sizes" : "string",
            "formats" : "string",
            "rightsList" : {
                "type" : "list of dict",
                "past_values_w_this_key" : "rights"
            },
            "descriptions" : {
                "type" : "list of dict",
                "past_first_occ" : "description"
            },
            "geoLocations" : {
                "flatten_all_in_this_key" : "geoLocations_raw"
            },
            "fundingReferences" : {
                "flatten_all_in_this_key" : "FundingReferences_raw"
            },
            "source" : "string",
            "isActive" : "string",
            "state" : "string",
            "viewCount" : "int",
            "downloadCount" : "int",
            "referenceCount" : "int",
            "citationCount" : "int",
            "versionCount" : "int",
            "created" : "string",
            "registered" : "string"
        },
        "relationships" : {
            "client" : {
                "go_to_sub_key" : "data",
                "get_key" : "id"
            },
            "provider" : {
                "go_to_sub_key" : "data",
                "get_key" : "id"
            }
        }
    }
}
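To make the instruction semantics concrete, here is a minimal sketch of how the `titles` entry above is applied; the payload fragment is made up for illustration, not real DataCite output:

titles_content = [{"title": "First title"}, {"title": "Second title"}]
instruction = {"type": "list of dict", "past_first_occ": "title"}
# past_first_occ: keep the value of the named key from the first list element
key_to_get = instruction["past_first_occ"]
print({key_to_get: titles_content[0][key_to_get]})  # {'title': 'First title'}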
z_personal_functions.py
import requests, json
def get_md_from_datacite( doi ) :
    """
    retrieve the research data metadata of a DOI from DataCite
    """
    req = requests.get( f"https://api.datacite.org/dois/{doi}" )
    try :
        res = req.json()
    except ValueError :
        return "error"
    if "errors" in res :
        return "error"
    return res
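# usage sketch, kept commented out so importing the module has no side effects
# (the DOI is the test one quoted in the enrichment script):
# md = get_md_from_datacite("10.57745/QYIAWX")
# if md == "error" :
#     print("DOI unknown to DataCite")
# else :
#     print(md["data"]["attributes"]["publicationYear"])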
def parse_value_following_instruction(key, instruction, datacite_content) :
    """
    retrieve the data received from DataCite, with its own structure, following the manually specified instructions
    key : the key to treat
    instruction : the instructions to apply to retrieve the value of the key
    datacite_content : the DataCite content at that key
    the instructions are dispatched mainly on their type (string or dict) : to rework for more consistency
    """
    buffer = {} ## gather the data in a dict
    ## when the data to retrieve is a simple object, just take the value
    if instruction == "string" or instruction == "int" :
        return {key: datacite_content}
    ## when the instructions are formatted as a dict
    if isinstance(instruction, dict) :
        ## if past_values_w_this_key is in the instructions
        if "past_values_w_this_key" in instruction :
            ## to debug
            ##print("attribute is", key)
            temp_key_to_get = instruction["past_values_w_this_key"]
            ## when the data is directly a dict (eg. types)
            if temp_key_to_get in datacite_content :
                buffer.update(
                    { temp_key_to_get : datacite_content[temp_key_to_get] }
                )
            ## when the data is a list that must be iterated on
            else :
                all_vals = [ item[ temp_key_to_get ] for item in datacite_content ]
                buffer.update(
                    {temp_key_to_get : ",".join(all_vals)}
                )
        ## when all the raw data must be flattened out
        if "flatten_all_in_this_key" in instruction :
            buffer.update(
                {instruction["flatten_all_in_this_key"] : str(datacite_content) }
            )
        ## if past_first_occ is in the instructions
        if "past_first_occ" in instruction :
            temp_key_to_get = instruction["past_first_occ"]
            buffer.update(
                {temp_key_to_get: datacite_content[0][temp_key_to_get]}
            )
        ## if go_to_sub_key is in the instructions
        if "go_to_sub_key" in instruction :
            temp_parent_key_to_get = instruction["go_to_sub_key"]
            ## the child key to find in the DataCite content
            temp_child_key_to_get = instruction["get_key"]
            buffer.update(
                {key : datacite_content[temp_parent_key_to_get][temp_child_key_to_get] }
            )
        ## fallback when no known instruction matched
        elif not buffer :
            buffer.update(
                {key : "to_do"}
            )
    return buffer
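# sketch of the go_to_sub_key path, commented out; the client id is made up:
# content = {"data": {"id": "inist.sshade", "type": "clients"}}
# instruction = {"go_to_sub_key": "data", "get_key": "id"}
# parse_value_following_instruction("client", instruction, content)
# ## -> {'client': 'inist.sshade'}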
def parse_datacite_md(raw_datacite_mds):
    """
    from the json file load the instructions
    iterate on all DataCite attributes
    if the data from DataCite is needed, get it with parse_value_following_instruction()
    iterate on all DataCite relationships
    if the data from DataCite is needed, get it with parse_value_following_instruction()
    """
    doi_md = {
        "doi" : raw_datacite_mds["data"]["id"]
    }
    ## ____0____ from the json file load the instructions
    with open("datacite-parser-instructions.json") as parser_file :
        datacite_parser = json.load(parser_file)
    ## list of all the attributes and relationships to retrieve
    attributes_to_get = datacite_parser["path-and-fields"]["attributes"].keys()
    relations_to_get = datacite_parser["path-and-fields"]["relationships"].keys()
    ## ____1___ iterate on DataCite attributes
    for attribute_key in raw_datacite_mds["data"]["attributes"] :
        attribute_value = raw_datacite_mds["data"]["attributes"][attribute_key]
        ## skip empty values (except for numbers)
        if not isinstance(attribute_value, int) and not attribute_value :
            # to follow up: print(f"{attribute_key} is empty")
            continue
        ## if the attribute is one of those to retrieve
        if attribute_key in attributes_to_get :
            ## redistribute the key name and its value
            value_to_add = parse_value_following_instruction(
                attribute_key,
                datacite_parser["path-and-fields"]["attributes"][attribute_key],
                attribute_value)
            doi_md.update(value_to_add)
    ## ____2___ iterate on DataCite relationships
    ### nb : this could be lightened by merging attributes and relationships
    for relation_key in raw_datacite_mds["data"]["relationships"] :
        relation_value = raw_datacite_mds["data"]["relationships"][relation_key]
        ## skip empty values (except for numbers)
        if not isinstance(relation_value, int) and not relation_value :
            continue
        ## if the relationship is specified in the instructions
        if relation_key in relations_to_get :
            ## to debug print("relation is", relation_key)
            relation_to_add = parse_value_following_instruction(
                relation_key,
                datacite_parser["path-and-fields"]["relationships"][relation_key],
                relation_value
            )
            doi_md.update(relation_to_add)
    return doi_md
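# end-to-end sketch, commented out (same example DOI as above):
# raw = get_md_from_datacite("10.57745/QYIAWX")
# if raw != "error" :
#     row = parse_datacite_md(raw)
#     print(row["doi"], row.get("publicationYear"))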
def from_repos_load_dois(repo_list) :
    """
    from the repo names (eg zenodo) load the DOI files
    """
    dois_raw = []
    ## load all dois
    for repo_name in repo_list :
        dois_raw += load_dois_from_file(repo_name)
    return dois_raw
def load_dois_from_file(repo_name) :
    """
    load the DOIs from the text files
    file name syntax "repo-dois.txt"
    """
    file_name = repo_name + "-dois.txt"
    folder_path = "../0-collect-data/"
    ## strip the trailing newline at the end of each line and lowercase the DOIs
    with open(folder_path + file_name) as f:
        dois_in_repo = [ line.rstrip("\n").lower() for line in f ]
    print(f"{repo_name} DOIs \t\t {len(dois_in_repo)}")
    return dois_in_repo
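# for reference, each repo-dois.txt file is expected to hold one DOI per line;
# made-up example content:
# 10.5281/zenodo.1234567
# 10.57745/qyiawx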
2-produce-graph/hist--datasets-by-year.png (new image, 25.6 KiB)
import pandas as pd, matplotlib, matplotlib.pyplot as plt
import z_my_functions as my_fct
df = my_fct.load_and_treat_csv()
print(df.columns)
print("nb de datasets\t", len(df))
## publicationYear contains a year of 1995 : rebuild a year field from "created", the registration date of the deposit
df["year"] = df["created"].str[:4]
#print( df["year"].value_counts() )
df_hist = pd.DataFrame( df.groupby(["year"])[["doi"]].agg(["count"])).reset_index()
## rename columns
df_hist.columns = ["year", "nb_dois"]
#print("\n", df_hist)
## ____N____ do graph
fig, (ax) = plt.subplots(figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax.bar(df_hist.year, df_hist.nb_dois , align='center', alpha = 1.0, color='#7e96c4', ecolor='black', label="tba")
plt.xticks(df_hist.year.values)
## remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylabel("number of datasets", labelpad = 10)
plt.title(f"Number of datasets by year", fontsize = 22, x = 0.5, y = 1.03, alpha = 0.6)
plt.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.8, alpha = 0.6)
plt.savefig("hist--datasets-by-year.png")
2-produce-graph/pie--datacite-client.png (new image, 30 KiB)
import pandas as pd, matplotlib, matplotlib.pyplot as plt
import z_my_functions as my_fct
import seaborn as sns
df = my_fct.load_and_treat_csv()
print(df.columns)
df_client_raw = df["client"].value_counts()
## regroup small values in "other"
threshold = 8
df_client = df_client_raw[df_client_raw > threshold]
## keep the client name before the point (eg. zenodo.cern)
clients_name = [item[: item.find(".")] for item in df_client.index]
df_client["other"] = df_client_raw[df_client_raw <= threshold].sum()
clients_name.append("other")
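## sketch of the grouping: with threshold 8, made-up counts
## {"a": 120, "b": 15, "c": 3, "d": 2} end up as {"a": 120, "b": 15, "other": 5}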
# define the Seaborn color palette to use
colors = sns.color_palette('pastel')[0:len(df_client)]
plt.pie(df_client, labels = clients_name, colors = colors, autopct='%.0f%%')
plt.title(f"Distribution of datasets by DataCite client", fontsize = 20, x = 0.5, y = 1.03, alpha = 0.6)
plt.savefig("pie--datacite-client.png")
# print(len(df))
z_my_functions.py
import pandas as pd
def load_and_treat_csv() :
    df_raw = pd.read_csv("../dois-uga.csv", index_col=False)
    ## remove DataCite types that are not "research data"
    types_to_exclude = ["Book", "ConferencePaper", "JournalArticle", "BookChapter", "Service", "Preprint"]
    df = df_raw[ ~df_raw["resourceTypeGeneral"].isin(types_to_exclude) ].copy()
    return df
# UGA Open Research Data monitor
Let's describe _open_ research data produced by Grenoble Alpes University!
_in process_
## Example
![](2-produce-graph/hist--datasets-by-year.png)
![](2-produce-graph/pie--datacite-client.png)
## Sources
import subprocess
def execute_python_file(fila_name):
    """
    execute a py program
@@ -20,4 +19,4 @@ file_names = [
    "rdg.py"
]
execute_python_file(file_names[1])