Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Projects: chetouae/open-research-data-monitor-back, mlarrieu/open-research-data-monitor-back

Commits on Source (123)
Showing with 2522 additions and 2263 deletions
@@ -25,16 +25,14 @@ actualisation_dois:
- git config user.name "${GITLAB_USER_NAME}"
- git config user.email "${GITLAB_USER_EMAIL}"
- git remote set-url --push origin "https://PUSH_TOKEN:${ACCESS_TOKEN}@gricad-gitlab.univ-grenoble-alpes.fr/${CI_PROJECT_PATH}.git"
- git add -f dois-uga.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
- git add -f dois-uga.csv dois-uga--last-500.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
- git commit -m "Pipeline run. Refresh of the DOIs and the graphs."
- git push origin HEAD:${CI_COMMIT_REF_NAME}
# create a space to hold the clone of the website repo
# create a space to hold the clone of the website repo, plus checks in case the space already exists
- cd ..
- mkdir cloned_repo 2>&1
- cd cloned_repo
- git clone ${LINK_TO_CLONE} 2>&1
- git pull
- if ! [ -d "cloned_repo" ]; then mkdir cloned_repo; fi
- if [ -d "cloned_repo/${PATH_TO_PUSH}" ]; then cd cloned_repo/${PATH_TO_PUSH}; git pull; else cd cloned_repo; git clone ${LINK_TO_CLONE}; fi
- cd -
# copy the "nb-dois.txt" file so it can be committed in the website repo
- cp open-research-data-monitor-back/1-enrich-with-datacite/nb-dois.txt cloned_repo/${PATH_TO_PUSH}
@@ -51,6 +49,7 @@ actualisation_dois:
# add the repo files that were modified, in case a problem occurred in "after_script"
paths:
- dois-uga.csv
- dois-uga--last-500.csv
- 2-produce-graph/hist-evol-datasets-per-repo.png
- 2-produce-graph/hist-quantity-year-type.png
- 2-produce-graph/pie--datacite-client.png
This diff is collapsed.
@@ -12,8 +12,8 @@ https://support.datacite.org/docs/can-i-see-more-detailed-affiliation-informatio
* memo: to drill down to the author level, use the filter `?person-id=orcid-nb`
* nb 2024-01-13
removal of AAU (jbru.aau) since everything there is conference PD
* note 2023-02
removal of AAU (jbru.aau) since everything there is conference PDF
addition of client.uid:inist.sshade and client.uid:inist.resif
"""
@@ -125,19 +125,6 @@ if main_no_dois :
print("datasets with an identifier other than DOI have been found")
[print(f"\t\t{elem}") for elem in main_no_dois]
## __________n__________ remove DOIs that are newer versions via the Figshare repository
## example 10.6084/m9.figshare.23737431.v2
## could be redone at harvest time by checking that relatedIdentifiers/relationType:"IsIdenticalTo" is not present
doi_to_remove = []
for doi in main_dois :
if "figshare" in doi :
# syntax: remove the version number
doi_shorten = doi[: len(doi) - 1]
if doi_shorten.endswith(".v") :
doi_to_remove.append(doi)
[main_dois.remove(elem) for elem in doi_to_remove]
## __________n__________ remove duplicates
unique_dois = list(set(main_dois))
print(f"\tNb of unique DOI\t{len(unique_dois)}")
@@ -17,4 +17,18 @@ mbeligne
acarbonnelle
annegf
tleduc
abey
\ No newline at end of file
abey
mbarletta
lmaritaud
jbeaureder
kboczon
llacoste
fcorsi
ecarlier
lvanbogaert
nrousselot
jlevy1
mflecheux
pbai
ymonnier
slecuyerchardevel
\ No newline at end of file
@@ -21,16 +21,16 @@ urls = [
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AUGA',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AUGA',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AUGA',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3AGrenoble',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AGrenoble',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AGrenoble',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AGrenoble'
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3A(Grenoble AND Alpes)',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3A(Grenoble AND Alpes)',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3A(Grenoble AND Alpes)',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3A(Grenoble AND Alpes)'
# more queries could be added here
]
# define a function to run the query with each URL for the different affiliations
def get_results(url):
req = requests.get(url)
req = requests.get(url+"&type=dataset")
#print(req.url)
results = [req.json()]
@@ -39,7 +39,7 @@ def get_results(url):
count = nb_res
page = 1
while(nb_res > 0):
newurl = url+"&start="+str(count)
newurl = url+"&type=dataset"+"&start="+str(count)
req = requests.get(newurl)
results.append(req.json())
nb_res = results[page]["data"]["count_in_response"]
@@ -59,7 +59,7 @@ def get_dois(results):
nb_dois += len(num_dois)
for item in num_dois :
dois.append(item["global_id"])
dois.append(item.get("global_id"))
print("\tnb DOIs\t\t" + str(nb_dois))
return dois
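
Taken together, the two edits above (adding &type=dataset to the first request and to every paginated request) amount to the following loop. A minimal standalone sketch, assuming the Dataverse-style response layout (data.count_in_response, data.items[*].global_id) that the surrounding code relies on:

import requests

BASE = ("https://entrepot.recherche.data.gouv.fr/api/search"
        "?q=*&fq=authorAffiliation%3A(Grenoble AND Alpes)")

def get_all_dois(url):
    # page through the search API, restricted to datasets
    dois, start = [], 0
    while True:
        req = requests.get(url + "&type=dataset" + "&start=" + str(start))
        data = req.json()["data"]
        if data["count_in_response"] == 0:
            break
        # global_id can be absent on some records, hence .get()
        dois += [item.get("global_id") for item in data["items"]]
        start += data["count_in_response"]
    return dois

print(len(get_all_dois(BASE)))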
This diff is collapsed.
@@ -5,7 +5,6 @@
"""
## todo
- v2: search for UGA as a funder via `grants.funder.doi`
- v2: go through the ORCIDs in `creator.orcid` and `contributors.orcid`
## Documentation
* List of the metadata required at deposit time (upload_type, sub_type, publication_date, title, creators) https://developers.zenodo.org/#representation
@@ -13,12 +12,9 @@
* Dev API doc, standard query fields https://developers.zenodo.org/#records
* doc on advanced search fields https://help.zenodo.org/guides/search/
* possible deposit types: publication: Publication, poster: Poster, presentation: Presentation, Dataset: Dataset, image: Image, video: Video/Audio, software: Software, lesson: Lesson, physicalobject: Physical object, other: Other
* drill down to the ORCIDs in `creator.orcid` and `contributors.orcid`
### Identifying the deposits:
- use of the `creators.affiliation` and `contributors.affiliation` fields (multiple affiliations)
- only the form "grenoble" is used, so some noise is possible
## Notes on retrieval
- example of query results: https://zenodo.org/api/records?q=creators.affiliation%3A*grenoble*&type=dataset&page=6&size=100&sort=mostrecent&all_version=False
- the same DOI appears twice: once at the root `[hits][doi]` and once in `[hits][metadata][doi]`
@@ -110,12 +106,20 @@ def req_zenodo(uga_perimeter, record_type) :
all_dois = set() # a set to gather all DOIs
uga_perimeter = "creators.affiliation:*grenoble* contributors.affiliation:*grenoble*"
uga_txt_query = "(\"grenoble alpes\" OR \"grenoble alps\" OR \"grenoble INP\" \
OR \"polytechnique de grenoble\" OR \"Grenoble Institute of Technology\" OR \"université de grenoble\" )"
uga_query = f"creators.affiliation:{uga_txt_query} contributors.affiliation:{uga_txt_query}"
## memo 2024-02: two fields separated by a space are searched as field one OR field two
## do not search with AND, since that searches across all the authors' affiliations
## SciencesPo Grenoble returns no results https://zenodo.org/search?q=creators.affiliation%3A%28sciencespo%20AND%20Grenoble%29&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
types_to_req = ["dataset", "image", "video", "software", "other"]
for record_type in types_to_req :
temp_dois = req_zenodo(uga_perimeter, record_type)
temp_dois = req_zenodo(uga_query, record_type)
## add the DOIs to the global buffer, which can only hold unique values
[all_dois.add(doi) for doi in temp_dois]
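
For context, one request behind req_zenodo can be sketched standalone; this assumes the public https://zenodo.org/api/records endpoint and the hits layout described in the notes above (the real req_zenodo is not shown here):

import requests

uga_txt_query = ('("grenoble alpes" OR "grenoble alps" OR "grenoble INP" '
                 'OR "polytechnique de grenoble" OR "Grenoble Institute of Technology" '
                 'OR "université de grenoble")')
# two space-separated field clauses are ORed by Zenodo's query parser (memo above)
uga_query = (f"creators.affiliation:{uga_txt_query} "
             f"contributors.affiliation:{uga_txt_query}")

params = {"q": uga_query, "type": "dataset", "size": 100,
          "page": 1, "sort": "mostrecent", "all_versions": False}
req = requests.get("https://zenodo.org/api/records", params=params)
hits = req.json()["hits"]["hits"]
# each hit carries the DOI at its root and again under metadata (see note above)
dois = {hit["doi"] for hit in hits}
print(len(dois))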
client,count,name,year,url
cern.zenodo,994,Zenodo,2013,https://zenodo.org/
inist.sshade,469,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
inist.osug,238,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
figshare.ars,228,figshare Academic Research System,2016,http://figshare.com/
dryad.dryad,157,DRYAD,2018,https://datadryad.org
inist.resif,78,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
inist.persyval,55,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
rdg.prod,49,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
inist.humanum,34,Huma-Num,2020,https://nakala.fr
figshare.sage,16,figshare SAGE Publications,2018,
mcdy.dohrmi,12,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
uqtr.mesxqq,7,Collection numérique (UQTR),2023,https://collection-numerique.uqtr.ca/
gfz.iugg2023,6,IUGG 2023,2022,https://gfzpublic.gfz-potsdam.de
rg.rg,4,ResearchGate,2016,https://www.researchgate.net/search/data
iris.iris,3,Incorporated Research Institutions for Seismology,2018,http://www.iris.edu/hq/
cern.zenodo,885,Zenodo,2013,https://zenodo.org/
inist.sshade,522,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
figshare.ars,380,figshare Academic Research System,2016,http://figshare.com/
inist.osug,275,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
dryad.dryad,168,DRYAD,2018,https://datadryad.org
inist.resif,99,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
rdg.prod,81,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
inist.humanum,75,NAKALA,2020,https://nakala.fr
inist.persyval,64,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
fmsh.prod,28,Fondation Maison des sciences de l'homme,2023,
inist.ccj,22,Centre Camille Jullian – UMR 7299,2020,
pangaea.repository,18,PANGAEA,2020,https://www.pangaea.de/
mcdy.dohrmi,14,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
inist.cirm,7,Centre International de Rencontres Mathématiques,2017,
figshare.sage,6,figshare SAGE Publications,2018,
iris.iris,5,NSF Seismological Facility for the Advancement of Geoscience (SAGE),2018,http://www.iris.edu/hq/
vqpf.dris,3,Direction des ressources et de l'information scientifique,2021,
tib.gfzbib,3,GFZpublic,2011,https://gfzpublic.gfz-potsdam.de
inist.epure,2,Éditions et presses universitaires de Reims,2020,
tib.repod,3,RepOD,2015,https://repod.icm.edu.pl/
cnic.sciencedb,3,ScienceDB,2022,https://www.scidb.cn/en
inist.eost,2,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
tib.gfz,2,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
bl.mendeley,2,Mendeley Data,2015,https://data.mendeley.com/
bl.nerc,2,NERC Environmental Data Service,2011,https://eds.ukri.org
jbru.eso,2,Espaces et Sociétés,2021,
tib.repod,2,RepOD,2015,
tug.openlib,2,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
crui.ingv,2,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
ugraz.unipub,2,unipub,2019,http://unipub.uni-graz.at
crui.ingv,1,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
ethz.sed,2,"Swiss Seismological Service, national earthquake monitoring and hazard center",2013,http://www.seismo.ethz.ch
inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
ethz.da-rd,1,ETHZ Data Archive - Research Data,2013,http://data-archive.ethz.ch
ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
repod.dbuw,1,University of Warsaw Research Data Repository,2023,https://danebadawcze.uw.edu.pl/
inist.ird,1,IRD,2016,
inist.omp,1,Observatoire Midi-Pyrénées,2011,
tib.gfz,1,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
edi.edi,1,Environmental Data Initiative,2017,https://portal.edirepository.org/nis/home.jsp
inist.mshsud,1,NumeRev,2019,https://www.projet.numerev.com/
jbru.idees,1,"IDEES : Identité et Différenciation de l'Espace, de l'Environnement et des Sociétés",2021,http://umr-idees.fr/
inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
ardcx.nci,1,National Computational Infrastructure,2020,
umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
bl.mendeley,1,Mendeley Data,2015,https://data.mendeley.com/
ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
psnc.uwr,1,Uniwersytet Wrocławski,2019,
inist.utc,1,Université de technologie de Compiègne,2019,https://www.utc.fr/
inist.ird,1,IRD,2016,
inist.eost,1,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
ihumi.pub,1,IHU Méditerranée Infection,2020,
arxiv.content,1,arXiv,2021,
estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
tug.openlib,1,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
inist.inrap,1,Institut national de recherches archéologiques préventives,2019,
tib.mpdl,1,Max Planck Digital Library,2015,
tudublin.arrow,1,ARROW@TU Dublin,2020,https://arrow.dit.ie/
@@ -20,7 +20,7 @@ print("\tDOIs to treat\t\t", len(dois))
## to test with a single DOI
# temp_doi = dois[random.randint(0, len(dois) - 1)]
# #temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
@@ -28,6 +28,7 @@ doi_error = [] # retrieve doi error
temp_rows = [] # put data in dict before df
df_old = pd.read_csv("../dois-uga.csv")
print(f"\n\tnb of DOIs already treated\t{len(df_old)}")
# req dataCite and paste data following instructions
@@ -55,15 +56,50 @@ for doi in dois : #[:300]
## if new datasets have been found
if temp_rows :
df_fresh = pd.DataFrame(temp_rows)
df_out = pd.concat([df_old, df_fresh], ignore_index=True)
dois_added = list(df_old["doi"])
to_del = []
for i in range(0, len(df_fresh)):
result = my_functions.get_origin_version(df_fresh.loc[i, "doi"])
if result[0] not in dois_added:
dois_added.append(result[0])
df_fresh.loc[i, "doi"] = result[0]
if str(result[1]) != "[]": df_fresh.loc[i, "traveled_dois"] = str(result[1])
else: df_fresh.loc[i, "traveled_dois"] = ""
if str(result[2]) != "[]": df_fresh.loc[i, "all_relations"] = str(result[2])
else: df_fresh.loc[i, "all_relations"] = ""
else:
to_del.append(i)
df_fresh.drop(to_del, inplace=True)
print("Nombre de dois supprimés : " + str(len(to_del)))
print("Nb dois a garder : " + str(len(dois_added)))
df_concat = pd.concat([df_old, df_fresh], ignore_index=True)
## remove unwanted DataCite types & clients
types_to_exclude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
clients_to_exclude = ["rg.rg", "inist.epure"]
df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(types_to_exclude) & ~df_concat["client"].isin(clients_to_exclude) ].copy()
## output main CSV
df_out.to_csv("../dois-uga.csv", index = False)
print(f"\n\nnb of doi exported \t{len(df_out)}")
# write the number of dois found in a file to display on the website
with open("nb-dois.txt", 'w') as outf :
outf.write(str(len(df_out)))
## output another csv with datacite client and number of datasets
## output last 500 DOIs to make it easier to open in web tools
df_last_dois = df_out.sort_values(by = "created", ascending = False, inplace = False)[:500]
df_last_dois["created"] = df_last_dois["created"].str[:10]
df_last_dois[["doi", "client", "resourceTypeGeneral", "created", "publisher", "rights", "sizes"]].to_csv("../dois-uga--last-500.csv", index = False)
## for the website: output another CSV with the DataCite client and number of datasets
df_client_raw = df_out["client"].value_counts().to_frame()
## get information about each client
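The dedup loop above depends on my_functions.get_origin_version, whose contract can only be inferred from the call site: it appears to return a (origin_doi, traveled_dois, all_relations) triple. A stub with that inferred shape makes the flow testable in isolation; everything below is illustrative, not the real helper:

import pandas as pd

def get_origin_version(doi):
    # stub: the real helper is assumed to resolve a DOI to its origin
    # version and report the DOIs and relations traversed on the way
    return doi, [], []

df_old = pd.DataFrame({"doi": ["10.1/a"]})
df_fresh = pd.DataFrame({"doi": ["10.1/a", "10.1/b"]})

dois_added, to_del = list(df_old["doi"]), []
for i in range(len(df_fresh)):
    origin, traveled, relations = get_origin_version(df_fresh.loc[i, "doi"])
    if origin not in dois_added:
        dois_added.append(origin)
        df_fresh.loc[i, "doi"] = origin
    else:
        to_del.append(i)  # already present under its origin DOI, so drop it
df_fresh.drop(to_del, inplace=True)
print(df_fresh)  # only 10.1/b remains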
2386
\ No newline at end of file
2692
\ No newline at end of file
2-produce-graph/hist-evol-datasets-per-repo.png: image replaced (66.3 KiB → 72.4 KiB)
@@ -94,8 +94,7 @@ for i, date in enumerate(df_evol.index) :
ax.set_xticks(x_idx_toshow)
ax.set_xticklabels(x_label_toshow, rotation=70, fontsize=10)
plt.title(f"Evolution of the quantity of UGA open datasets\n and distribution per repository", \
plt.title(f"Cumulative view of the quantity of UGA research data\n and distribution by repository", \
fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
plt.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.87, alpha = 0.6)
2-produce-graph/hist-last-datasets-by-client.png: image replaced (50.1 KiB → 57.2 KiB)
@@ -53,7 +53,7 @@ ax.tick_params(axis='both', which='major', labelsize=8)
plt.legend(reverse = False)
plt.title(f"Datasets registered over the last 30 days\ndistributed by DataCite client",
plt.title(f"Research data registered over the last 30 days\ndistributed by DataCite client",
fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
plt.suptitle(f"n = {len(df_last_weeks)}", fontsize = 12, x = 0.5, y = 0.89, alpha = 0.6)
2-produce-graph/hist-quantity-year-type.png: image replaced (54.7 KiB → 61.8 KiB)
@@ -17,7 +17,7 @@ df_year_type.index.rename("year", inplace = True)
## a set of colors via plt
### see color palettes https://matplotlib.org/stable/users/explain/colors/colormaps.html
colors = [plt.cm.tab20(i) for i in range(len(df_year_type.columns))]
colors = [plt.cm.Set3(i) for i in range(len(df_year_type.columns))]
ax = df_year_type.plot(
kind = "bar",
@@ -39,7 +39,7 @@ plt.xlabel(None)
plt.legend(loc="center", reverse = True, bbox_to_anchor=(0.45, 0.65), fontsize = 10)
plt.title(f"Distribution of datasets by registration year\nand DataCite types",
plt.title(f"Distribution of research data by registration year\nand DataCite types",
fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
plt.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.90, alpha = 0.6)
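The palette switch above (plt.cm.tab20 to plt.cm.Set3) keeps the same sampling pattern, one color per column of the year-by-type table. A minimal sketch with dummy data standing in for df_year_type:

import matplotlib.pyplot as plt
import pandas as pd

df_year_type = pd.DataFrame({"Dataset": [5, 9], "Software": [2, 4]},
                            index=[2022, 2023])
# one color per column, taken from the qualitative Set3 colormap
colors = [plt.cm.Set3(i) for i in range(len(df_year_type.columns))]
ax = df_year_type.plot(kind="bar", stacked=True, color=colors)
plt.title("Distribution of research data by registration year\nand DataCite types")
plt.savefig("hist-quantity-year-type.png")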
2-produce-graph/pie--datacite-client.png: image replaced (58.8 KiB → 65.8 KiB)
2-produce-graph/pie--datacite-type.png: image replaced (52.7 KiB → 64.9 KiB)
This diff is collapsed.