
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.
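In git terms this corresponds to a three-dot comparison, i.e. "git diff target...source": the source is diffed against its merge base with the target rather than against the target's tip.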

Source and target projects:
  • chetouae/open-research-data-monitor-back
  • mlarrieu/open-research-data-monitor-back
Commits on Source (120)
Showing 2516 additions and 2254 deletions
@@ -25,13 +25,14 @@ actualisation_dois:
- git config user.name "${GITLAB_USER_NAME}"
- git config user.email "${GITLAB_USER_EMAIL}"
- git remote set-url --push origin "https://PUSH_TOKEN:${ACCESS_TOKEN}@gricad-gitlab.univ-grenoble-alpes.fr/${CI_PROJECT_PATH}.git"
- git add -f dois-uga.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
- git add -f dois-uga.csv dois-uga--last-500.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
- git commit -m "Execution du pipeline. Actualisation des dois et des graphes."
- git push origin HEAD:${CI_COMMIT_REF_NAME}
# create a directory to hold the clone of the website repo, with checks in case it already exists
- if ! [ -d "../cloned_repo" ]; then mkdir ../cloned_repo; fi
- if [ -d "../cloned_repo/${PATH_TO_PUSH}" ]; then cd ../cloned_repo/${PATH_TO_PUSH}; git pull; else cd ../cloned_repo; git clone ${LINK_TO_CLONE}; fi
- cd ..
- if ! [ -d "cloned_repo" ]; then mkdir cloned_repo; fi
- if [ -d "cloned_repo/${PATH_TO_PUSH}" ]; then cd cloned_repo/${PATH_TO_PUSH}; git pull; else cd cloned_repo; git clone ${LINK_TO_CLONE}; fi
- cd -
# copy the "nb-dois.txt" file to commit it to the website repo
- cp open-research-data-monitor-back/1-enrich-with-datacite/nb-dois.txt cloned_repo/${PATH_TO_PUSH}
@@ -48,6 +49,7 @@ actualisation_dois:
# add the repository files that were modified, in case a problem occurred in "after_script"
paths:
- dois-uga.csv
- dois-uga--last-500.csv
- 2-produce-graph/hist-evol-datasets-per-repo.png
- 2-produce-graph/hist-quantity-year-type.png
- 2-produce-graph/pie--datacite-client.png
......
This diff is collapsed.
@@ -125,19 +125,6 @@ if main_no_dois :
print("datasets with an other identifier than DOI has been finded")
[print(f"\t\t{elem}") for elem in main_no_dois]
## __________n__________ remove DOIs that are newer versions from the Figshare repository
## example 10.6084/m9.figshare.23737431.v2
## could be redone at harvest time by checking that relatedIdentifiers/relationType:"IsIdenticalTo" is not present
doi_to_remove = []
for doi in main_dois :
    if "figshare" in doi :
        # syntax: strip the version number from the end
        doi_shorten = doi[: len(doi) - 1]
        if doi_shorten.endswith(".v") :
            doi_to_remove.append(doi)
[main_dois.remove(elem) for elem in doi_to_remove]
## __________n__________ remove duplicates
unique_dois = list(set(main_dois))
print(f"\tNb of unique DOI\t{len(unique_dois)}")
......
@@ -17,4 +17,18 @@ mbeligne
acarbonnelle
annegf
tleduc
abey
\ No newline at end of file
abey
mbarletta
lmaritaud
jbeaureder
kboczon
llacoste
fcorsi
ecarlier
lvanbogaert
nrousselot
jlevy1
mflecheux
pbai
ymonnier
slecuyerchardevel
\ No newline at end of file
@@ -21,16 +21,16 @@ urls = [
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AUGA',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AUGA',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AUGA',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3AGrenoble',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AGrenoble',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AGrenoble',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AGrenoble'
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3A(Grenoble AND Alpes)',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3A(Grenoble AND Alpes)',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3A(Grenoble AND Alpes)',
    'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3A(Grenoble AND Alpes)'
    # more queries can be added here
]
# define a function to run the query with each URL for the different affiliations
def get_results(url):
    req = requests.get(url)
    req = requests.get(url+"&type=dataset")
    #print(req.url)
    results = [req.json()]
@@ -39,7 +39,7 @@ def get_results(url):
    count = nb_res
    page = 1
    while(nb_res > 0):
        newurl = url+"&start="+str(count)
        newurl = url+"&type=dataset"+"&start="+str(count)
        req = requests.get(newurl)
        results.append(req.json())
        nb_res = results[page]["data"]["count_in_response"]
@@ -59,7 +59,7 @@ def get_dois(results):
        nb_dois += len(num_dois)
        for item in num_dois :
            dois.append(item["global_id"])
            dois.append(item.get("global_id"))
    print("\tnb DOIs\t\t" + str(nb_dois))
    return dois
......
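For context, a condensed sketch of the harvest loop these two hunks modify; the helper name harvest_dois is illustrative, and the response fields (data.count_in_response, data.items, global_id) follow the Dataverse Search API used above:

    import requests

    BASE = "https://entrepot.recherche.data.gouv.fr/api/search"

    def harvest_dois(fq):
        # page through the results for one affiliation filter, dataset type only
        dois, start = [], 0
        while True:
            req = requests.get(BASE, params={"q": "*", "fq": fq, "type": "dataset", "start": start})
            data = req.json()["data"]
            if data["count_in_response"] == 0:
                break
            # each item exposes its DOI in "global_id"
            dois += [item.get("global_id") for item in data["items"]]
            start += data["count_in_response"]
        return dois

    # e.g. harvest_dois("authorAffiliation:(Grenoble AND Alpes)")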
This diff is collapsed.
@@ -106,12 +106,20 @@ def req_zenodo(uga_perimeter, record_type) :
all_dois = set() # a set to gather all DOIs
uga_perimeter = "creators.affiliation:*grenoble* contributors.affiliation:*grenoble*"
uga_txt_query = "(\"grenoble alpes\" OR \"grenoble alps\" OR \"grenoble INP\" \
OR \"polytechnique de grenoble\" OR \"Grenoble Institute of Technology\" OR \"université de grenoble\" )"
uga_query = f"creators.affiliation:{uga_txt_query} contributors.affiliation:{uga_txt_query}"
## memo 2024-02: two fields separated by a space search in the first field OR in the second field
## do not search with AND, because it matches across all of the authors' affiliations
## SciencesPo Grenoble returns no results https://zenodo.org/search?q=creators.affiliation%3A%28sciencespo%20AND%20Grenoble%29&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
types_to_req = ["dataset", "image", "video", "software", "other"]
for record_type in types_to_req :
    temp_dois = req_zenodo(uga_perimeter, record_type)
    temp_dois = req_zenodo(uga_query, record_type)
    ## put the DOIs into the general buffer, which only keeps unique values
    [all_dois.add(doi) for doi in temp_dois]
......
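The body of req_zenodo sits outside this hunk; a minimal sketch of the request it presumably wraps, using Zenodo's public records API (the size parameter and the DOI extraction are assumptions):

    import requests

    def req_zenodo(uga_query, record_type):
        # query one record type within the UGA perimeter (sketch only, no pagination)
        req = requests.get("https://zenodo.org/api/records",
                           params={"q": uga_query, "type": record_type, "size": 100})
        # every hit exposes its DOI in the top-level "doi" field
        return [hit.get("doi") for hit in req.json()["hits"]["hits"]]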
client,count,name,year,url
cern.zenodo,994,Zenodo,2013,https://zenodo.org/
inist.sshade,469,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
inist.osug,238,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
figshare.ars,228,figshare Academic Research System,2016,http://figshare.com/
dryad.dryad,157,DRYAD,2018,https://datadryad.org
inist.resif,78,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
inist.persyval,55,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
rdg.prod,49,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
inist.humanum,34,Huma-Num,2020,https://nakala.fr
figshare.sage,16,figshare SAGE Publications,2018,
mcdy.dohrmi,12,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
uqtr.mesxqq,7,Collection numérique (UQTR),2023,https://collection-numerique.uqtr.ca/
gfz.iugg2023,6,IUGG 2023,2022,https://gfzpublic.gfz-potsdam.de
rg.rg,4,ResearchGate,2016,https://www.researchgate.net/search/data
iris.iris,3,Incorporated Research Institutions for Seismology,2018,http://www.iris.edu/hq/
cern.zenodo,905,Zenodo,2013,https://zenodo.org/
inist.sshade,530,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
figshare.ars,380,figshare Academic Research System,2016,http://figshare.com/
inist.osug,275,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
dryad.dryad,168,DRYAD,2018,https://datadryad.org
inist.resif,101,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
rdg.prod,85,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
inist.humanum,75,NAKALA,2020,https://nakala.fr
inist.persyval,64,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
fmsh.prod,28,Fondation Maison des sciences de l'homme,2023,
inist.ccj,22,Centre Camille Jullian – UMR 7299,2020,
pangaea.repository,18,PANGAEA,2020,https://www.pangaea.de/
mcdy.dohrmi,14,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
inist.cirm,7,Centre International de Rencontres Mathématiques,2017,
figshare.sage,6,figshare SAGE Publications,2018,
iris.iris,5,NSF Seismological Facility for the Advancement of Geoscience (SAGE),2018,http://www.iris.edu/hq/
tib.repod,4,RepOD,2015,https://repod.icm.edu.pl/
vqpf.dris,3,Direction des ressources et de l'information scientifique,2021,
tib.gfzbib,3,GFZpublic,2011,https://gfzpublic.gfz-potsdam.de
inist.epure,2,Éditions et presses universitaires de Reims,2020,
cnic.sciencedb,3,ScienceDB,2022,https://www.scidb.cn/en
inist.eost,2,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
tib.gfz,2,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
bl.mendeley,2,Mendeley Data,2015,https://data.mendeley.com/
bl.nerc,2,NERC Environmental Data Service,2011,https://eds.ukri.org
jbru.eso,2,Espaces et Sociétés,2021,
tib.repod,2,RepOD,2015,
tug.openlib,2,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
crui.ingv,2,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
ugraz.unipub,2,unipub,2019,http://unipub.uni-graz.at
crui.ingv,1,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
ethz.sed,2,"Swiss Seismological Service, national earthquake monitoring and hazard center",2013,http://www.seismo.ethz.ch
inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
ethz.da-rd,1,ETHZ Data Archive - Research Data,2013,http://data-archive.ethz.ch
ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
repod.dbuw,1,University of Warsaw Research Data Repository,2023,https://danebadawcze.uw.edu.pl/
inist.ird,1,IRD,2016,
inist.omp,1,Observatoire Midi-Pyrénées,2011,
tib.gfz,1,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
edi.edi,1,Environmental Data Initiative,2017,https://portal.edirepository.org/nis/home.jsp
inist.mshsud,1,NumeRev,2019,https://www.projet.numerev.com/
jbru.idees,1,"IDEES : Identité et Différenciation de l'Espace, de l'Environnement et des Sociétés",2021,http://umr-idees.fr/
inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
ardcx.nci,1,National Computational Infrastructure,2020,
umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
bl.mendeley,1,Mendeley Data,2015,https://data.mendeley.com/
ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
psnc.uwr,1,Uniwersytet Wrocławski,2019,
inist.utc,1,Université de technologie de Compiègne,2019,https://www.utc.fr/
inist.ird,1,IRD,2016,
inist.eost,1,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
ihumi.pub,1,IHU Méditerranée Infection,2020,
arxiv.content,1,arXiv,2021,
estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
tug.openlib,1,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
inist.inrap,1,Institut national de recherches archéologiques préventives,2019,
tib.mpdl,1,Max Planck Digital Library,2015,
tudublin.arrow,1,ARROW@TU Dublin,2020,https://arrow.dit.ie/
@@ -20,7 +20,7 @@ print("\tDOIs to treat\t\t", len(dois))
## to try with a single DOI
# temp_doi = dois[random.randint(0, len(dois))]
# #temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
@@ -28,6 +28,7 @@ doi_error = [] # retrieve doi error
temp_rows = [] # put data in dict before df
df_old = pd.read_csv("../dois-uga.csv")
print(f"\n\tnb of DOIs already treated\t{len(df_old)}")
# query dataCite and insert the returned data following the instructions
@@ -55,21 +56,50 @@ for doi in dois : #[:300]
## if new datasets have been found
if temp_rows :
    df_fresh = pd.DataFrame(temp_rows)
    dois_added = list(df_old["doi"])
    to_del = []
    for i in range(0, len(df_fresh)):
        result = my_functions.get_origin_version(df_fresh.loc[i, "doi"])
        if result[0] not in dois_added:
            dois_added.append(result[0])
            df_fresh.loc[i, "doi"] = result[0]
            if str(result[1]) != "[]": df_fresh.loc[i, "traveled_dois"] = str(result[1])
            else: df_fresh.loc[i, "traveled_dois"] = ""
            if str(result[2]) != "[]": df_fresh.loc[i, "all_relations"] = str(result[2])
            else: df_fresh.loc[i, "all_relations"] = ""
        else:
            to_del.append(i)
    df_fresh.drop(to_del, inplace=True)
    print("Number of DOIs removed: " + str(len(to_del)))
    print("Number of DOIs to keep: " + str(len(dois_added)))
    df_concat = pd.concat([df_old, df_fresh], ignore_index=True)
    ## remove not wanted datacite type
    ## remove not wanted datacite type & clients
    type_to_explude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
    df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(type_to_explude) ].copy()
    ## output main CSV
    clients_to_exclude = ["rg.rg", "inist.epure"]
    df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(type_to_explude) & ~df_concat["client"].isin(clients_to_exclude) ].copy()
    ## output main CSV
    df_out.to_csv("../dois-uga.csv", index = False)
    print(f"\n\nnb of doi exported \t{len(df_out)}")
    # write the number of dois found in a file to display on the website
    with open("nb-dois.txt", 'w') as outf :
        outf.write(str(len(df_out)))
    ## output another csv with datacite client and number of datasets
    ## output last 500 DOIs to make it easier to open in web tools
    df_last_dois = df_out.sort_values(by = "created", ascending = False, inplace = False)[:500]
    df_last_dois["created"] = df_last_dois["created"].str[:10]
    df_last_dois[["doi", "client", "resourceTypeGeneral", "created", "publisher", "rights", "sizes"]].to_csv("../dois-uga--last-500.csv", index = False)
    ## for the website: output another csv with datacite client and number of datasets
    df_client_raw = df_out["client"].value_counts().to_frame()
    ## get information about each client
......
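The hunk is truncated here; a hypothetical sketch of the per-client enrichment that could follow, assuming the DataCite clients API exposes name, created and url attributes and matching the column layout of the CSV shown above (client,count,name,year,url):

    import requests, pandas as pd

    rows = []
    for client_id, count in df_out["client"].value_counts().items():
        # hypothetical lookup of each client's details in the DataCite clients API
        attrs = requests.get(f"https://api.datacite.org/clients/{client_id}").json()["data"]["attributes"]
        rows.append({"client": client_id, "count": count, "name": attrs.get("name"),
                     "year": str(attrs.get("created"))[:4], "url": attrs.get("url")})
    pd.DataFrame(rows).to_csv("all_datacite_clients_for_uga.csv", index = False)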
2386
\ No newline at end of file
2727
\ No newline at end of file
import requests, json
# Function to avoid duplicated data when different DOIs point to the same files:
# in Zenodo, for example, each version of a deposit has its own DOI, and we must walk up to the parent ("concept") DOI.
# If the parent DOI obtained, or an "IsIdenticalTo" relation, refers to a DOI already present in the csv, it must be ignored.
def get_origin_version(doi, history=None):
    # history=None rather than a mutable default: a shared default list would keep
    # its values between calls, so history could be non-empty on a first call
    if history is None:
        history = []
    req = requests.get( f"https://api.datacite.org/dois/{doi}" )
    res = req.json()
    final = []
    # doi is the DOI that will be added to the csv, history traces the DOIs and the
    # relations followed during the lookups, and final records the relations of the
    # final DOI added to the csv
    result = (doi, history, final)
    try:
        related = res["data"]["attributes"]["relatedIdentifiers"] # check whether the current DOI has relations
    except (KeyError, TypeError):
        pass # no relation: return the current DOI
    else:
        ignore = False # the DOI has a parent ("concept") version that must be found, so the current DOI must be ignored
        duplicate = False # the DOI is identical to another one
        for i in related:
            final.append(i.get("relationType"))
            if i.get("relationType") == "IsVersionOf" and i.get("relatedIdentifierType") == "DOI":
                ignore = True
                elem_to_save_i = i.get("relatedIdentifier")
                history.append([i.get("relationType"), i.get("relatedIdentifier")])
            if i.get("relationType") == "IsIdenticalTo" and i.get("relatedIdentifierType") == "DOI":
                duplicate = True
                elem_to_save_d = i.get("relatedIdentifier") # "IsIdenticalTo" is not symmetrical, so keeping the other DOI (not the current one) is enough to avoid duplicates
                history.append([i.get("relationType"), i.get("relatedIdentifier")])
        if duplicate and not ignore:
            result = (elem_to_save_d, history, final) # identical but no parent version: we can stop here
        if ignore:
            result = get_origin_version(elem_to_save_i, history) # a parent version exists: move up without looking at the identical ones
    return result
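A quick usage sketch with the versioned Figshare DOI quoted in the deleted hunk earlier; the exact output depends on the live DataCite records:

    doi, history, relations = get_origin_version("10.6084/m9.figshare.23737431.v2")
    print(doi)      # expected: the unversioned parent DOI reached via "IsVersionOf"
    print(history)  # the relations walked, e.g. [['IsVersionOf', '10.6084/m9.figshare.23737431']]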
def get_md_from_datacite( doi ) :
    """
    retrieve research data metadata from DataCite
......
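The body of get_md_from_datacite is collapsed in this diff; a minimal sketch of the retrieval it wraps, reusing the DataCite endpoint already called in get_origin_version (the error handling is an assumption):

    def get_md_from_datacite(doi):
        # sketch only: fetch the DataCite record for one DOI and return its JSON
        req = requests.get(f"https://api.datacite.org/dois/{doi}")
        return req.json() if req.status_code == 200 else None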
2-produce-graph/hist-evol-datasets-per-repo.png: image replaced (66.3 KiB → 72 KiB)
@@ -94,8 +94,7 @@ for i, date in enumerate(df_evol.index) :
ax.set_xticks(x_idx_toshow)
ax.set_xticklabels(x_label_toshow, rotation=70, fontsize=10)
plt.title(f"Evolution of the quantity of UGA open datasets\n and distribution per repository", \
plt.title(f"Cumulative view of the quantity of UGA research data\n and distribution by repository", \
fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
plt.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.87, alpha = 0.6)
......
2-produce-graph/hist-last-datasets-by-client.png: image replaced (50.1 KiB → 53.5 KiB)
@@ -53,7 +53,7 @@ ax.tick_params(axis='both', which='major', labelsize=8)
plt.legend(reverse = False)
plt.title(f"Datasets registered over the last 30 days\ndistributed by DataCite client",
plt.title(f"Research data registered over the last 30 days\ndistributed by DataCite client",
fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
plt.suptitle(f"n = {len(df_last_weeks)}", fontsize = 12, x = 0.5, y = 0.89, alpha = 0.6)
......
2-produce-graph/hist-quantity-year-type.png: image replaced (54.7 KiB → 61.5 KiB)
2-produce-graph/pie--datacite-client.png: image replaced (58.8 KiB → 65.5 KiB)
2-produce-graph/pie--datacite-type.png: image replaced (50.8 KiB → 63.6 KiB)
This diff is collapsed.