Commit f3ed5704 authored by Maxence Larrieu

add step 1 and 2

parent 7b967ddd
.gitignore
/0-collect-data/personnal-keys.json
/0-collect-data/bso/
/1-enrich-with-datacite/__pycache__/
/2-produce-graph/__pycache__/
/hide
import z_personal_functions as my_functions
import requests, json, random, pandas as pd
# ______0______ load DOIs and remove duplicates
## specify the list of repositories to import
repo_list = ["nakala", "bso-via-hal", "datacite", "zenodo", "rdg"]
dois_raw = my_functions.from_repos_load_dois(repo_list)
print("DOIs loaded\t\t\t", len(dois_raw))
## remove duplicate
dois = list(set(dois_raw))
print("DOIs to treat\t\t\t", len(dois))
# ______1_____ load metadata from DataCite and keep the selected fields
## to test with a single DOI
# # random doi 10.25656/01:8509
# temp_doi = random.choice(dois)
# #temp_doi = "10.57745/QYIAWX"
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
doi_error = [] # DOIs that could not be retrieved
temp_rows = [] # one dict per DOI, gathered before building the DataFrame
df_old = pd.read_csv("../dois-uga.csv")
print(f"\nnb of dois already treated\t{len(df_old)}")
# query DataCite and extract fields following the parser instructions
for doi in dois : #[:300]
    ## skip DOIs already treated
    if doi in df_old["doi"].values :
        #print(f"\talready treated\t\t{doi}")
        continue
    ## get metadata from DataCite
    raw_md = my_functions.get_md_from_datacite(doi)
    ### if the DOI is not in DataCite
    if raw_md == "error" :
        doi_error.append(doi)
        continue
    ## retrieve the selected fields following the manual instructions
    selected_md = my_functions.parse_datacite_md(raw_md) ## place the results in a dict
    temp_rows.append(selected_md) ## append this dict to the list
    print(f"\tadded\t\t{doi}")
if temp_rows :
    df_fresh = pd.DataFrame(temp_rows)
    df_out = pd.concat([df_old, df_fresh], ignore_index=True)
    df_out.to_csv("../dois-uga.csv", index = False)
    print(f"\n\nnb of dois exported \t{len(df_out)}")
datacite-parser-instructions.json
{
    "title" : "describes the paths and fields of the DataCite metadata to retrieve",
    "version" : "2023-11-20",
    "comments" : {
        "0" : "the dates attribute is not handled",
        "1" : "the relatedIdentifiers attribute is not handled"
    },
    "path-and-fields" : {
        "attributes" : {
            "titles" : {
                "type" : "list of dict",
                "past_first_occ" : "title"
            },
            "publisher" : "string",
            "publicationYear" : "int",
            "subjects" : {
                "past_values_w_this_key" : "subject",
                "flatten_all_in_this_key" : "subject_raw"
            },
            "language" : "string",
            "types" : {
                "type" : "dict",
                "past_values_w_this_key" : "resourceTypeGeneral"
            },
            "sizes" : "string",
            "formats" : "string",
            "rightsList" : {
                "type" : "list of dict",
                "past_values_w_this_key" : "rights"
            },
            "descriptions" : {
                "type" : "list of dict",
                "past_first_occ" : "description"
            },
            "geoLocations" : {
                "flatten_all_in_this_key" : "geoLocations_raw"
            },
            "fundingReferences" : {
                "flatten_all_in_this_key" : "FundingReferences_raw"
            },
            "source" : "string",
            "isActive" : "string",
            "state" : "string",
            "viewCount" : "int",
            "downloadCount" : "int",
            "referenceCount" : "int",
            "citationCount" : "int",
            "versionCount" : "int",
            "created" : "string",
            "registered" : "string"
        },
        "relationships" : {
            "client" : {
                "go_to_sub_key" : "data",
                "get_key" : "id"
            },
            "provider" : {
                "go_to_sub_key" : "data",
                "get_key" : "id"
            }
        }
    }
}
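To make the instruction semantics concrete, here is a minimal sketch of how the `titles` entry above is applied; the payload fragment is made up for illustration, not real DataCite output:

titles_content = [{"title": "First title"}, {"title": "Second title"}]
instruction = {"type": "list of dict", "past_first_occ": "title"}
# past_first_occ: keep the value of the named key from the first list element
key_to_get = instruction["past_first_occ"]
print({key_to_get: titles_content[0][key_to_get]})  # {'title': 'First title'}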
z_personal_functions.py
import requests, json
def get_md_from_datacite( doi ) :
    """
    retrieve the research data metadata of a DOI from DataCite
    """
    req = requests.get( f"https://api.datacite.org/dois/{doi}" )
    try :
        res = req.json()
    except ValueError :
        return "error"
    if "errors" in res :
        return "error"
    return res
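# usage sketch, kept commented out so importing the module has no side effects
# (the DOI is the test one quoted in the enrichment script):
# md = get_md_from_datacite("10.57745/QYIAWX")
# if md == "error" :
#     print("DOI unknown to DataCite")
# else :
#     print(md["data"]["attributes"]["publicationYear"])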
def parse_value_following_instruction(key, instruction, datacite_content) :
    """
    retrieve the data received from DataCite, with its own structure, following the manually specified instructions
    key : the key to treat
    instruction : the instructions to apply to retrieve the value of the key
    datacite_content : the DataCite content at that key
    the instructions are dispatched mainly on their type (string or dict) : to rework for more consistency
    """
    buffer = {} ## gather the data in a dict
    ## when the data to retrieve is a simple object, just take the value
    if instruction == "string" or instruction == "int" :
        return {key: datacite_content}
    ## when the instructions are formatted as a dict
    if isinstance(instruction, dict) :
        ## if past_values_w_this_key is in the instructions
        if "past_values_w_this_key" in instruction :
            ## to debug
            ##print("attribute is", key)
            temp_key_to_get = instruction["past_values_w_this_key"]
            ## when the data is directly a dict (eg. types)
            if temp_key_to_get in datacite_content :
                buffer.update(
                    { temp_key_to_get : datacite_content[temp_key_to_get] }
                )
            ## when the data is a list that must be iterated on
            else :
                all_vals = [ item[ temp_key_to_get ] for item in datacite_content ]
                buffer.update(
                    {temp_key_to_get : ",".join(all_vals)}
                )
        ## when all the raw data must be flattened out
        if "flatten_all_in_this_key" in instruction :
            buffer.update(
                {instruction["flatten_all_in_this_key"] : str(datacite_content) }
            )
        ## if past_first_occ is in the instructions
        if "past_first_occ" in instruction :
            temp_key_to_get = instruction["past_first_occ"]
            buffer.update(
                {temp_key_to_get: datacite_content[0][temp_key_to_get]}
            )
        ## if go_to_sub_key is in the instructions
        if "go_to_sub_key" in instruction :
            temp_parent_key_to_get = instruction["go_to_sub_key"]
            ## the child key to find in the DataCite content
            temp_child_key_to_get = instruction["get_key"]
            buffer.update(
                {key : datacite_content[temp_parent_key_to_get][temp_child_key_to_get] }
            )
        ## fallback when no known instruction matched
        elif not buffer :
            buffer.update(
                {key : "to_do"}
            )
    return buffer
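# sketch of the go_to_sub_key path, commented out; the client id is made up:
# content = {"data": {"id": "inist.sshade", "type": "clients"}}
# instruction = {"go_to_sub_key": "data", "get_key": "id"}
# parse_value_following_instruction("client", instruction, content)
# ## -> {'client': 'inist.sshade'}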
def parse_datacite_md(raw_datacite_mds):
    """
    from the json file load the instructions
    iterate on all DataCite attributes
    if the data from DataCite is needed, get it with parse_value_following_instruction()
    iterate on all DataCite relationships
    if the data from DataCite is needed, get it with parse_value_following_instruction()
    """
    doi_md = {
        "doi" : raw_datacite_mds["data"]["id"]
    }
    ## ____0____ from the json file load the instructions
    with open("datacite-parser-instructions.json") as parser_file :
        datacite_parser = json.load(parser_file)
    ## list of all the attributes and relationships to retrieve
    attributes_to_get = datacite_parser["path-and-fields"]["attributes"].keys()
    relations_to_get = datacite_parser["path-and-fields"]["relationships"].keys()
    ## ____1___ iterate on DataCite attributes
    for attribute_key in raw_datacite_mds["data"]["attributes"] :
        attribute_value = raw_datacite_mds["data"]["attributes"][attribute_key]
        ## skip empty values (except for numbers)
        if not isinstance(attribute_value, int) and not attribute_value :
            # to follow up: print(f"{attribute_key} is empty")
            continue
        ## if the attribute is one of those to retrieve
        if attribute_key in attributes_to_get :
            ## redistribute the key name and its value
            value_to_add = parse_value_following_instruction(
                attribute_key,
                datacite_parser["path-and-fields"]["attributes"][attribute_key],
                attribute_value)
            doi_md.update(value_to_add)
    ## ____2___ iterate on DataCite relationships
    ### nb : this could be lightened by merging attributes and relationships
    for relation_key in raw_datacite_mds["data"]["relationships"] :
        relation_value = raw_datacite_mds["data"]["relationships"][relation_key]
        ## skip empty values (except for numbers)
        if not isinstance(relation_value, int) and not relation_value :
            continue
        ## if the relationship is specified in the instructions
        if relation_key in relations_to_get :
            ## to debug print("relation is", relation_key)
            relation_to_add = parse_value_following_instruction(
                relation_key,
                datacite_parser["path-and-fields"]["relationships"][relation_key],
                relation_value
            )
            doi_md.update(relation_to_add)
    return doi_md
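# end-to-end sketch, commented out (same example DOI as above):
# raw = get_md_from_datacite("10.57745/QYIAWX")
# if raw != "error" :
#     row = parse_datacite_md(raw)
#     print(row["doi"], row.get("publicationYear"))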
def from_repos_load_dois(repo_list) :
    """
    from the repo names (eg zenodo) load the DOI files
    """
    dois_raw = []
    ## load all dois
    for repo_name in repo_list :
        dois_raw += load_dois_from_file(repo_name)
    return dois_raw
def load_dois_from_file(repo_name) :
    """
    load the DOIs from the text files
    file name syntax "repo-dois.txt"
    """
    file_name = repo_name + "-dois.txt"
    folder_path = "../0-collect-data/"
    ## strip the trailing newline at the end of each line and lowercase the DOIs
    with open(folder_path + file_name) as f:
        dois_in_repo = [ line.rstrip("\n").lower() for line in f ]
    print(f"{repo_name} DOIs \t\t {len(dois_in_repo)}")
    return dois_in_repo
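# for reference, each repo-dois.txt file is expected to hold one DOI per line;
# made-up example content:
# 10.5281/zenodo.1234567
# 10.57745/qyiawx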
2-produce-graph/hist--datasets-by-year.png (new image, 25.6 KiB)
import pandas as pd, matplotlib, matplotlib.pyplot as plt
import z_my_functions as my_fct
df = my_fct.load_and_treat_csv()
print(df.columns)
print("nb de datasets\t", len(df))
## publicationYear contains a year of 1995 : rebuild a year field from "created", the registration date of the deposit
df["year"] = df["created"].str[:4]
#print( df["year"].value_counts() )
df_hist = pd.DataFrame( df.groupby(["year"])[["doi"]].agg(["count"])).reset_index()
## rename columns
df_hist.columns = ["year", "nb_dois"]
#print("\n", df_hist)
## ____N____ do graph
fig, (ax) = plt.subplots(figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k')
ax.bar(df_hist.year, df_hist.nb_dois , align='center', alpha = 1.0, color='#7e96c4', ecolor='black', label="tba")
plt.xticks(df_hist.year.values)
## remove top and right spines
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_ylabel("number of datasets", labelpad = 10)
plt.title(f"Number of datasets by year", fontsize = 22, x = 0.5, y = 1.03, alpha = 0.6)
plt.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.8, alpha = 0.6)
plt.savefig("hist--datasets-by-year.png")
2-produce-graph/pie--datacite-client.png (new image, 30 KiB)
import pandas as pd, matplotlib, matplotlib.pyplot as plt
import z_my_functions as my_fct
import seaborn as sns
df = my_fct.load_and_treat_csv()
print(df.columns)
df_client_raw = df["client"].value_counts()
## regroup small values in "other"
threshold = 8
df_client = df_client_raw[df_client_raw > threshold]
## keep the client name before the point (eg. zenodo.cern)
clients_name = [item[: item.find(".")] for item in df_client.index]
df_client["other"] = df_client_raw[df_client_raw <= threshold].sum()
clients_name.append("other")
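## sketch of the grouping: with threshold 8, made-up counts
## {"a": 120, "b": 15, "c": 3, "d": 2} end up as {"a": 120, "b": 15, "other": 5}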
# define the Seaborn color palette to use
colors = sns.color_palette('pastel')[0:len(df_client)]
plt.pie(df_client, labels = clients_name, colors = colors, autopct='%.0f%%')
plt.title(f"Distribution of datasets by DataCite client", fontsize = 20, x = 0.5, y = 1.03, alpha = 0.6)
plt.savefig("pie--datacite-client.png")
# print(len(df))
z_my_functions.py
import pandas as pd
def load_and_treat_csv() :
    df_raw = pd.read_csv("../dois-uga.csv", index_col=False)
    ## remove DataCite types that are not "research data"
    types_to_exclude = ["Book", "ConferencePaper", "JournalArticle", "BookChapter", "Service", "Preprint"]
    df = df_raw[ ~df_raw["resourceTypeGeneral"].isin(types_to_exclude) ].copy()
    return df
# UGA Open Research Data monitor
Let's describe _open_ research data produced by Grenoble Alpes University!
_in process_
## Example
![](2-produce-graph/hist--datasets-by-year.png)
![](2-produce-graph/pie--datacite-client.png)
## Sources
import subprocess
def execute_python_file(fila_name):
    """
    execute a py program
@@ -20,4 +19,4 @@ file_names = [
    "rdg.py"
]
execute_python_file(file_names[1])