
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (184)
Showing with 2578 additions and 2262 deletions
@@ -3,29 +3,57 @@ image: python:3-alpine
actualisation_dois:
only:
# restrict to scheduled or manual (web) triggers, to avoid looping (the commit produced by the pipeline would otherwise trigger the pipeline again)
- schedules
- web
before_script:
# install what is needed to run git
- apk update
- apk add git openssh
script:
# install the Python libraries and run the script
- pip install pandas
- pip install requests
- pip install matplotlib
- python run-all-codes.py
after_script:
# commit the changes produced by running the script
- git config user.name "${GITLAB_USER_NAME}"
- git config user.email "${GITLAB_USER_EMAIL}"
- git remote set-url --push origin "https://PUSH_TOKEN:${ACCESS_TOKEN}@gricad-gitlab.univ-grenoble-alpes.fr/${CI_PROJECT_PATH}.git"
- - git add -f dois-uga.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png
+ - git add -f dois-uga.csv dois-uga--last-500.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
- git commit -m "Execution du pipeline. Actualisation des dois et des graphes."
- - git push origin HEAD:$CI_COMMIT_REF_NAME
+ - git push origin HEAD:${CI_COMMIT_REF_NAME}
# create a directory to hold the clone of the website repo, with checks in case it already exists
- cd ..
- if ! [ -d "cloned_repo" ]; then mkdir cloned_repo; fi
- if [ -d "cloned_repo/${PATH_TO_PUSH}" ]; then cd cloned_repo/${PATH_TO_PUSH}; git pull; else cd cloned_repo; git clone ${LINK_TO_CLONE}; fi
- cd -
# copy the "nb-dois.txt" file so it can be committed to the website repo
- cp open-research-data-monitor-back/1-enrich-with-datacite/nb-dois.txt cloned_repo/${PATH_TO_PUSH}
- cd cloned_repo/${PATH_TO_PUSH}
# commit the "nb-dois.txt" file to the website repo
- git config user.name "${GITLAB_USER_NAME}"
- git config user.email "${GITLAB_USER_EMAIL}"
- git remote set-url --push origin "https://PUSH_TOKEN2:${ACCESS_TOKEN2}@gricad-gitlab.univ-grenoble-alpes.fr/${PROJECT_PATH2}.git"
- git add -f nb-dois.txt
- git commit -m "Execution du pipeline. Actualisation du nombre de dois."
- git push origin HEAD:main
artifacts:
# keep the modified repository files, in case a problem occurred in "after_script"
paths:
- dois-uga.csv
- dois-uga--last-500.csv
- 2-produce-graph/hist-evol-datasets-per-repo.png
- 2-produce-graph/hist-quantity-year-type.png
- 2-produce-graph/pie--datacite-client.png
- 2-produce-graph/pie--datacite-type.png
- 2-produce-graph/hist-last-datasets-by-client.png
- 1-enrich-with-datacite/nb-dois.txt
- 1-enrich-with-datacite/all_datacite_clients_for_uga.csv
@@ -12,8 +12,8 @@ https://support.datacite.org/docs/can-i-see-more-detailed-affiliation-informatio
* memo: to drill down to the author level, use the `?person-id=orcid-nb` filter
- * NB 2024-01-13
- removed AAU (jbru.aau) since everything there is conference PD
+ * note 2023-02
+ removed AAU (jbru.aau) since everything there is conference PDFs
added client.uid:inist.sshade and client.uid:inist.resif
"""
@@ -125,19 +125,6 @@ if main_no_dois :
print("datasets with an other identifier than DOI has been finded")
[print(f"\t\t{elem}") for elem in main_no_dois]
- ## __________n__________ remove DOIs that are newer versions in the figshare repository
- ## example 10.6084/m9.figshare.23737431.v2
- ## could be redone at harvest time by checking that relatedIdentifiers/relationType:"IsIdenticalTo" is not present
- doi_to_remove = []
- for doi in main_dois :
-     if "figshare" in doi :
-         # syntax: strip the version number
-         doi_shorten = doi[: len(doi) - 1]
-         if doi_shorten.endswith(".v") :
-             doi_to_remove.append(doi)
- [main_dois.remove(elem) for elem in doi_to_remove]
## __________n__________ remove duplicates
unique_dois = list(set(main_dois))
print(f"\tNb of unique DOI\t{len(unique_dois)}")
@@ -17,4 +17,18 @@ mbeligne
acarbonnelle
annegf
tleduc
- abey
\ No newline at end of file
+ abey
+ mbarletta
+ lmaritaud
+ jbeaureder
+ kboczon
+ llacoste
+ fcorsi
+ ecarlier
+ lvanbogaert
+ nrousselot
+ jlevy1
+ mflecheux
+ pbai
+ ymonnier
+ slecuyerchardevel
\ No newline at end of file
@@ -21,16 +21,16 @@ urls = [
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AUGA',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AUGA',
'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AUGA',
- 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3AGrenoble',
- 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3AGrenoble',
- 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3AGrenoble',
- 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3AGrenoble'
+ 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=datasetContactAffiliation%3A(Grenoble AND Alpes)',
+ 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=authorAffiliation%3A(Grenoble AND Alpes)',
+ 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=producerAffiliation%3A(Grenoble AND Alpes)',
+ 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq=contributorAffiliation%3A(Grenoble AND Alpes)'
# other queries could be added here
]
# define a function that runs the request with each URL, for the different affiliations
def get_results(url):
- req = requests.get(url)
+ req = requests.get(url+"&type=dataset")
#print(req.url)
results = [req.json()]
@@ -39,7 +39,7 @@ def get_results(url):
count = nb_res
page = 1
while(nb_res > 0):
newurl = url+"&start="+str(count)
newurl = url+"&type=dataset"+"&start="+str(count)
req = requests.get(newurl)
results.append(req.json())
nb_res = results[page]["data"]["count_in_response"]
@@ -59,7 +59,7 @@ def get_dois(results):
nb_dois += len(num_dois)
for item in num_dois :
dois.append(item["global_id"])
dois.append(item.get("global_id"))
print("\tnb DOIs\t\t" + str(nb_dois))
return dois
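Pieced together, the pagination this diff modifies looks roughly like the following self-contained sketch (hedged: `harvest` is an illustrative rewrite, not the repo's code; the `count_in_response`, `items` and `global_id` fields are the ones the script already reads from the Dataverse search API):

```python
import requests

BASE = "https://entrepot.recherche.data.gouv.fr/api/search"

def harvest(fq):
    """Collect dataset global_ids for one affiliation filter, page by page."""
    dois, start = [], 0
    while True:
        req = requests.get(BASE, params={"q": "*", "fq": fq, "type": "dataset", "start": start})
        data = req.json()["data"]
        if data["count_in_response"] == 0:
            return dois
        dois += [item.get("global_id") for item in data["items"]]
        start += data["count_in_response"]

print(len(harvest("authorAffiliation:(Grenoble AND Alpes)")))
```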
@@ -5,7 +5,6 @@
"""
## todo
- v2: look for UGA as a funder, `grants.funder.doi`
- - v2: go through the ORCIDs in `creator.orcid` and `contributors.orcid`
## Documentation
* list of metadata required at deposit time (upload_type, sub_type, publication_date, title, creators, ...) https://developers.zenodo.org/#representation
@@ -13,12 +12,9 @@
* dev API docs, standard query fields https://developers.zenodo.org/#records
* docs on advanced search fields https://help.zenodo.org/guides/search/
* possible deposit types: publication: Publication, poster: Poster, presentation: Presentation, Dataset: Dataset, image: Image, video: Video/Audio, software: Software, lesson: Lesson, physicalobject: Physical object, other: Other
- * drill down to the ORCID level via `creator.orcid` and `contributors.orcid`
### Identifying the deposits:
- use of the `creators.affiliation` and `contributor.affiliation` fields (multiple affiliations)
- use of the form "grenoble" only, which may introduce noise
## Notes on harvesting
- example of a query result: https://zenodo.org/api/records?q=creators.affiliation%3A*grenoble*&type=dataset&page=6&size=100&sort=mostrecent&all_version=False
- the same DOI appears twice: once at the root `[hits][doi]` and again in `[hits][metadata][doi]`
@@ -29,13 +25,6 @@ import requests, json
print("\n\nRunning zenodo.py")
with open("personnal-keys.json") as f :
## load zenodo keys for requests the API
ACCESS_TOKEN = json.load(f)["ZENODO_KEY"]
def req_zenodo_with_page(uga_perimeter, record_type, page_nb) :
"""
returns the UGA datasets from Zenodo
@@ -51,7 +40,6 @@ def req_zenodo_with_page(uga_perimeter, record_type, page_nb) :
"size" : 100,
"sort" : "mostrecent",
"all_version" : False,
"access_tpoken" : ACCESS_TOKEN
}
)
# for debugging
@@ -118,12 +106,20 @@ def req_zenodo(uga_perimeter, record_type) :
all_dois = set() # a set to gather all DOIs
- uga_perimeter = "creators.affiliation:*grenoble* contributors.affiliation:*grenoble*"
+ uga_txt_query = "(\"grenoble alpes\" OR \"grenoble alps\" OR \"grenoble INP\" \
+ OR \"polytechnique de grenoble\" OR \"Grenoble Institute of Technology\" OR \"université de grenoble\" )"
+ uga_query = f"creators.affiliation:{uga_txt_query} contributors.affiliation:{uga_txt_query}"
+ ## memo 2024-02: two fields separated by a space search in the first field OR the second
+ ## do not search with AND, since that searches across all of the authors' affiliations
+ ## SciencesPo Grenoble returns no results https://zenodo.org/search?q=creators.affiliation%3A%28sciencespo%20AND%20Grenoble%29&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
types_to_req = ["dataset", "image", "video", "software", "other"]
for record_type in types_to_req :
- temp_dois = req_zenodo(uga_perimeter, record_type)
+ temp_dois = req_zenodo(uga_query, record_type)
## add the DOIs to the global buffer, which only keeps unique values
[all_dois.add(doi) for doi in temp_dois]
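A single page of the new query can be reproduced against the public `/api/records` endpoint. A minimal sketch (the affiliation list is trimmed here; the full one is in `uga_txt_query` above; no token is needed for public records, which is consistent with `ACCESS_TOKEN` being dropped):

```python
import requests

affil = '("grenoble alpes" OR "grenoble alps" OR "grenoble INP")'
# per the memo above: two space-separated fields are ORed together
query = f"creators.affiliation:{affil} contributors.affiliation:{affil}"

req = requests.get(
    "https://zenodo.org/api/records",
    params={"q": query, "type": "dataset", "size": 100, "page": 1,
            "sort": "mostrecent", "all_version": False},
)
hits = req.json()["hits"]["hits"]
# the DOI is available at the root of each hit (and again in [metadata][doi])
print(len(hits), [hit["doi"] for hit in hits[:3]])
```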
client,count,name,year,url
cern.zenodo,885,Zenodo,2013,https://zenodo.org/
inist.sshade,522,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
figshare.ars,380,figshare Academic Research System,2016,http://figshare.com/
inist.osug,275,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
dryad.dryad,168,DRYAD,2018,https://datadryad.org
inist.resif,99,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
rdg.prod,81,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
inist.humanum,75,NAKALA,2020,https://nakala.fr
inist.persyval,64,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
fmsh.prod,28,Fondation Maison des sciences de l'homme,2023,
inist.ccj,22,Centre Camille Jullian – UMR 7299,2020,
pangaea.repository,18,PANGAEA,2020,https://www.pangaea.de/
mcdy.dohrmi,14,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
inist.cirm,7,Centre International de Rencontres Mathématiques,2017,
figshare.sage,6,figshare SAGE Publications,2018,
iris.iris,5,NSF Seismological Facility for the Advancement of Geoscience (SAGE),2018,http://www.iris.edu/hq/
vqpf.dris,3,Direction des ressources et de l'information scientifique,2021,
tib.gfzbib,3,GFZpublic,2011,https://gfzpublic.gfz-potsdam.de
tib.repod,3,RepOD,2015,https://repod.icm.edu.pl/
cnic.sciencedb,3,ScienceDB,2022,https://www.scidb.cn/en
inist.eost,2,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
tib.gfz,2,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
bl.mendeley,2,Mendeley Data,2015,https://data.mendeley.com/
bl.nerc,2,NERC Environmental Data Service,2011,https://eds.ukri.org
tug.openlib,2,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
crui.ingv,2,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
ugraz.unipub,2,unipub,2019,http://unipub.uni-graz.at
ethz.sed,2,"Swiss Seismological Service, national earthquake monitoring and hazard center",2013,http://www.seismo.ethz.ch
inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
ethz.da-rd,1,ETHZ Data Archive - Research Data,2013,http://data-archive.ethz.ch
ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
repod.dbuw,1,University of Warsaw Research Data Repository,2023,https://danebadawcze.uw.edu.pl/
inist.ird,1,IRD,2016,
inist.omp,1,Observatoire Midi-Pyrénées,2011,
umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
edi.edi,1,Environmental Data Initiative,2017,https://portal.edirepository.org/nis/home.jsp
bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
ardcx.nci,1,National Computational Infrastructure,2020,
ihumi.pub,1,IHU Méditerranée Infection,2020,
inist.inrap,1,Institut national de recherches archéologiques préventives,2019,
tib.mpdl,1,Max Planck Digital Library,2015,
tudublin.arrow,1,ARROW@TU Dublin,2020,https://arrow.dit.ie/
@@ -20,7 +20,7 @@ print("\tDOIs to treat\t\t", len(dois))
## to try with a single DOI
# temp_doi = dois[random.randint(0, len(dois))]
- # #temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
+ # temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
@@ -28,6 +28,7 @@ doi_error = [] # retrieve doi error
temp_rows = [] # put data in dict before df
df_old = pd.read_csv("../dois-uga.csv")
print(f"\n\tnb of DOIs already treated\t{len(df_old)}")
# query DataCite and fill in the data following the instructions
@@ -52,10 +53,70 @@ for doi in dois : #[:300]
temp_rows.append(selected_md) ## append this dictionary to the list
print(f"\tadded\t\t{doi}")
## if new datasets have been found
if temp_rows :
df_fresh = pd.DataFrame(temp_rows)
- df_out = pd.concat([df_old, df_fresh], ignore_index=True)
dois_added = list(df_old["doi"])
to_del = []
for i in range(0, len(df_fresh)):
result = my_functions.get_origin_version(df_fresh.loc[i, "doi"])
if result[0] not in dois_added:
dois_added.append(result[0])
df_fresh.loc[i, "doi"] = result[0]
if str(result[1]) != "[]": df_fresh.loc[i, "traveled_dois"] = str(result[1])
else: df_fresh.loc[i, "traveled_dois"] = ""
if str(result[2]) != "[]": df_fresh.loc[i, "all_relations"] = str(result[2])
else: df_fresh.loc[i, "all_relations"] = ""
else:
to_del.append(i)
df_fresh.drop(to_del, inplace=True)
print("Nombre de dois supprimés : " + str(len(to_del)))
print("Nb dois a garder : " + str(len(dois_added)))
df_concat = pd.concat([df_old, df_fresh], ignore_index=True)
## remove unwanted DataCite types & clients
types_to_exclude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
clients_to_exclude = ["rg.rg", "inist.epure"]
df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(types_to_exclude) & ~df_concat["client"].isin(clients_to_exclude) ].copy()
## output main CSV
df_out.to_csv("../dois-uga.csv", index = False)
print(f"\n\nnb of doi exported \t{len(df_out)}")
# write the number of dois found in a file to display on the website
with open("nb-dois.txt", 'w') as outf :
outf.write(str(len(df_out)))
## output last 500 DOIs to make it easier to open in web tools
df_last_dois = df_out.sort_values(by = "created", ascending = False, inplace = False)[:500]
df_last_dois["created"] = df_last_dois["created"].str[:10]
df_last_dois[["doi", "client", "resourceTypeGeneral", "created", "publisher", "rights", "sizes"]].to_csv("../dois-uga--last-500.csv", index = False)
## for the website: output another CSV with each DataCite client and its number of datasets
df_client_raw = df_out["client"].value_counts().to_frame()
## get information about each client
client_names = []
client_years = []
client_urls = []
for i in range(0, len(df_client_raw)):
client = df_client_raw.iloc[i].name
req = requests.get('https://api.datacite.org/clients?query=id:%22'+str(client)+'%22')
client_names.append(req.json()["data"][0]["attributes"]["name"])
client_years.append(req.json()["data"][0]["attributes"]["year"])
client_urls.append(req.json()["data"][0]["attributes"]["url"])
## add the information to the output CSV
df_client_raw["name"] = client_names
df_client_raw["year"] = client_years
df_client_raw["url"] = client_urls
df_client_raw.to_csv("all_datacite_clients_for_uga.csv")
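The loop above parses the same response three times per client; an equivalent lookup that parses it once could look like this sketch (`client_info` is an illustrative helper; the endpoint and attribute names are the ones already used above, the expected output matches the CSV earlier in this diff):

```python
import requests

def client_info(client_id):
    """Fetch the name, year and url attributes of one DataCite client id."""
    req = requests.get('https://api.datacite.org/clients?query=id:%22' + client_id + '%22')
    attrs = req.json()["data"][0]["attributes"]
    return {key: attrs[key] for key in ("name", "year", "url")}

print(client_info("cern.zenodo"))
# -> {'name': 'Zenodo', 'year': 2013, 'url': 'https://zenodo.org/'}
```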
2692
\ No newline at end of file
2-produce-graph/hist-evol-datasets-per-repo.png (image diff: 63.3 KiB → 72.4 KiB)
2-produce-graph/hist-last-datasets-by-client.png (image diff: 46.7 KiB → 57.2 KiB)