Compare revisions

61a64fcb · 31859d11 · b34d1de1 · 81509eff · 3e515446 · af08f681
--- a/.gitignore
+++ b/.gitignore
 /0-collect-data/personnal-keys.json
 /0-collect-data/bso/
-
-
-/hide
\ No newline at end of file
+/1-enrich-with-datacite/__pycache__/
+/2-produce-graph/__pycache__/
+/hide
+0-collect-data/.ipynb_checkpoints/z-datacite-demo-checkpoint.ipynb
+0-collect-data/.ipynb_checkpoints/z-resultats-demo-datacite-checkpoint.csv
+0-collect-data/z-resultats-demo-datacite.csv
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
+image: python:3-alpine
+
+# Job that runs the collection scripts, commits the refreshed DOI files and
+# graphs to this repository, then publishes "nb-dois.txt" to the website repository.
+actualisation_dois:
+
+  only:
+    # restrict to scheduled or manual (web) triggers to avoid looping (the commit produced by the pipeline would trigger the pipeline again)
+    - schedules
+    - web
+
+
+  before_script:
+    # packages required to run git
+    - apk update
+    - apk add git openssh
+
+  script:
+    # install the python libraries and run the script
+    - pip install pandas
+    - pip install requests
+    - pip install matplotlib
+    - python run-all-codes.py
+
+  after_script:
+    # commit the changes produced by the script
+    - git config user.name "${GITLAB_USER_NAME}"
+    - git config user.email "${GITLAB_USER_EMAIL}"
+    - git remote set-url --push origin "https://PUSH_TOKEN:${ACCESS_TOKEN}@gricad-gitlab.univ-grenoble-alpes.fr/${CI_PROJECT_PATH}.git"
+    - git add -f dois-uga.csv dois-uga--last-500.csv 2-produce-graph/hist-evol-datasets-per-repo.png 2-produce-graph/hist-quantity-year-type.png 2-produce-graph/pie--datacite-client.png 2-produce-graph/pie--datacite-type.png 2-produce-graph/hist-last-datasets-by-client.png 1-enrich-with-datacite/all_datacite_clients_for_uga.csv 1-enrich-with-datacite/nb-dois.txt
+    # commit only when something is staged: "git commit" exits non-zero when there is nothing to commit,
+    # which would presumably stop the remaining after_script commands (website update below) - confirm against the runner shell options
+    - git diff --cached --quiet || git commit -m "Execution du pipeline. Actualisation des dois et des graphes."
+    - git push origin HEAD:${CI_COMMIT_REF_NAME}
+
+    # create a directory hosting the clone of the website repository, handling the case where it already exists
+    - cd ..
+    - if ! [ -d "cloned_repo" ]; then mkdir cloned_repo; fi
+    - if [ -d "cloned_repo/${PATH_TO_PUSH}" ]; then cd cloned_repo/${PATH_TO_PUSH}; git pull; else cd cloned_repo; git clone ${LINK_TO_CLONE}; fi
+    - cd -
+    # copy "nb-dois.txt" so it can be committed in the website repository
+    # (CI_PROJECT_DIR instead of a hard-coded directory name, so a renamed or forked project still works)
+    - cp "${CI_PROJECT_DIR}/1-enrich-with-datacite/nb-dois.txt" cloned_repo/${PATH_TO_PUSH}
+    - cd cloned_repo/${PATH_TO_PUSH}
+    # commit "nb-dois.txt" to the website repository
+    - git config user.name "${GITLAB_USER_NAME}"
+    - git config user.email "${GITLAB_USER_EMAIL}"
+    - git remote set-url --push origin "https://PUSH_TOKEN2:${ACCESS_TOKEN2}@gricad-gitlab.univ-grenoble-alpes.fr/${PROJECT_PATH2}.git"
+    - git add -f nb-dois.txt
+    - git diff --cached --quiet || git commit -m "Execution du pipeline. Actualisation du nombre de dois."
+    - git push origin HEAD:main
+
+  artifacts:
+    # keep the files modified in the repository, in case something went wrong in "after_script"
+    paths:
+      - dois-uga.csv
+      - dois-uga--last-500.csv
+      - 2-produce-graph/hist-evol-datasets-per-repo.png
+      - 2-produce-graph/hist-quantity-year-type.png
+      - 2-produce-graph/pie--datacite-client.png
+      - 2-produce-graph/pie--datacite-type.png
+      - 2-produce-graph/hist-last-datasets-by-client.png
+      - 1-enrich-with-datacite/nb-dois.txt
+      - 1-enrich-with-datacite/all_datacite_clients_for_uga.csv
--- a/0-collect-data/datacite-dois.txt
+++ b/0-collect-data/datacite-dois.txt
--- a/0-collect-data/datacite.py
+++ b/0-collect-data/datacite.py
+# récupérer les DOIs de l'UGA depuis Datacite
+## 2023-12-01, Elias Chetouane, Maxence Larrieu
+
+"""
+
+## Doc
+* datacite API : https://support.datacite.org/docs/api
+* Recherche d'un DOI : https://support.datacite.org/docs/api-sampling
+* Pagination : https://support.datacite.org/docs/pagination
+* nb : "DataCite added support for affiliation identifiers in Metadata Schema 4.3, released August 2019"
+https://support.datacite.org/docs/can-i-see-more-detailed-affiliation-information-in-the-rest-api
+
+* mémo : pour descendre au niveau des auteurs , le filtre `?person-id=orcid-nb`
+
+* note 2023-02
+retrait de l'AAU (jbru.aau) car tout est du PDF de congrès
+ajout de client.uid:inist.sshade et client.uid:inist.resif
+"""
+
+def get_results(query_from_list, view_results):
+    """
+    Run one query against the DataCite REST API and collect every result page.
+
+    query_from_list -- query expression appended after "?query="
+    view_results -- when truthy, print the request URL and the total reported by the API
+
+    Returns the list of decoded JSON answers, one element per page, in page order.
+    Raises requests.HTTPError when the API answers with an error status.
+    """
+
+    query_root = "https://api.datacite.org/dois?query="
+
+    # resource types to exclude (to be decided: should "text" be removed too?)
+    # note 2023-12-15: this only removes 15 DOIs
+    query_filter_type = " AND resource_type_id:(-book -bookChapter -conferencePaper -conferenceProceeding -dissertation -event -journal -journalArticle -peerReview -preprint -report -service)"
+
+    query_page = "&page[size]=100"
+
+    req = requests.get(query_root + query_from_list + query_filter_type + query_page, timeout=60)
+    # fail loudly on an HTTP error instead of a confusing KeyError on the payload
+    req.raise_for_status()
+    results = [req.json()]
+
+    if view_results:
+        # print total dataset per query
+        print(f"\n\t{req.url}")
+        print(f"\t{results[0]['meta']['total']}")
+
+    # collect the following pages: bounded by the page count announced by the API,
+    # and stopping early if a page does not provide a "next" link
+    nb_pages = results[0]["meta"]["totalPages"]
+    next_url = results[0].get("links", {}).get("next")
+    while len(results) < nb_pages and next_url:
+        req = requests.get(next_url, timeout=60)
+        req.raise_for_status()
+        results.append(req.json())
+        next_url = results[-1].get("links", {}).get("next")
+    return results
+
+
+def get_dois(results):
+    """
+    Return the ids of the records typed "dois" found in the paginated results.
+
+    The id of any record with another type is appended to the
+    module-level list main_no_dois instead.
+    """
+    doi_ids = []
+
+    # walk through every page, then every record of the page
+    for page in results:
+        for record in page["data"]:
+            record_id = record["id"]
+
+            if record.get("type") != "dois":
+                # not a DOI: keep track of it in the global list
+                main_no_dois.append(record_id)
+                continue
+
+            doi_ids.append(record_id)
+
+    return doi_ids
+
+
+print("\n\nRunning datacite.py")
+
+import requests
+import pandas as pd
+
+main_dois = []      # every DOI found, duplicates included
+main_no_dois = []   # ids that are not DOIs, filled by get_dois
+
+## __________0__________ query with all RORs from UGA on authors and contributors
+
+### load the table containing the RORs found
+df_raw = pd.read_csv("uga-find-ror-childs/UGA-ror-childs--2023-12-29--manual.csv")
+#print("columns name: ", [colname for colname in df_raw.columns])
+
+### select the relevant RORs
+df_hal = df_raw[df_raw["docid"].notna()]  ## RORs found via HAL
+## RORs selected manually ; warning: "Unnamed: 6" is the name pandas gives to the 7th, header-less column
+df_ror = df_raw[df_raw["Unnamed: 6"] == "include"]
+rors = df_hal.ror.tolist() + df_ror.ror.tolist()
+rors.append("https://ror.org/02rx3b187")  ## add the ROR from UGA !
+print(f"\t__process by ROR\n\tnb of ROR loaded\t{len(rors)}")
+
+# debug : try only with UGA ROR
+# rors = ["https://ror.org/02rx3b187"]
+
+for ror in rors:  # to debug add [:1]
+    for auth_type in ["creators", "contributors"]:
+        query = f"{auth_type}.affiliation.affiliationIdentifier:\"{ror}\""
+        main_dois.extend(get_dois(get_results(query, False)))
+
+print(f"\tnb DOIs finded \t{len(main_dois)}")
+
+
+## __________1__________ query by datacite client and UGA as publisher
+print(f"\n\t__process by datacite clients")
+query_client_publisher = [
+    "client_id:inist.osug",             # OSUG https://doi.osug.fr/
+    "client.uid:inist.sshade",          # national observation services carried by OSUG
+    "client.uid:inist.resif",           # national observation services carried by OSUG
+    "client_id:inist.persyval",         # Labex Persyval-lab (PIA)
+    "publisher:(grenoble AND alpes)"    # warning: brings in text resources
+]
+
+for query in query_client_publisher:
+    main_dois.extend(get_dois(get_results(query, True)))
+
+print(f"\tnb DOI finded \t{len(main_dois)}")
+
+
+## __________n__________ report datasets found with something else than a DOI
+if main_no_dois:
+    print("datasets with an other identifier than DOI has been finded")
+    for elem in main_no_dois:
+        print(f"\t\t{elem}")
+
+## __________n__________ remove duplicates
+# sorted: a bare set has a run-dependent order, which would rewrite the
+# committed file on every pipeline run even when no DOI changed
+unique_dois = sorted(set(main_dois))
+print(f"\tNb of unique DOI\t{len(unique_dois)}")
+
+# __________z__________ export DOIs in txt files
+with open("datacite-dois.txt", 'w') as f:
+    f.writelines(f"{doi}\n" for doi in unique_dois)
+
+
--- a/0-collect-data/nakala-dois.txt
+++ b/0-collect-data/nakala-dois.txt
-10.34847/nkl.6caam3dp
-10.34847/nkl.76abr599
 10.34847/nkl.a0fe865m
+10.34847/nkl.76abr599
+10.34847/nkl.6caam3dp
 10.34847/nkl.5bcck3cz
 10.34847/nkl.ca709965
 10.34847/nkl.ca8dmbdh
@@ -8,8 +8,8 @@
 10.34847/nkl.748eqz51
 10.34847/nkl.2c0fj3ai
 10.34847/nkl.2bad8uj6
-10.34847/nkl.87788ro3
 10.34847/nkl.2404plkh
+10.34847/nkl.87788ro3
 10.34847/nkl.bafagy29
 10.34847/nkl.9bd4vqc6
 10.34847/nkl.4540o25d
@@ -22,5 +22,11 @@
 10.34847/nkl.3dbc2mtb
 10.34847/nkl.bc2b1071
 10.34847/nkl.81dcdekj
-10.34847/nkl.b1cb3arm
-10.34847/nkl.c9e92or4
+10.34847/nkl.ef903o6v
+10.34847/nkl.ae94a74k
+10.34847/nkl.bf5f263z
+10.34847/nkl.9f85iol5
+10.34847/nkl.345bf9i7
+10.34847/nkl.9cd8hi4k
+10.34847/nkl.e1e41vdi
+10.34847/nkl.deb655as
--- a/0-collect-data/nakala-uga-users.txt
+++ b/0-collect-data/nakala-uga-users.txt
@@ -15,4 +15,20 @@ egreslou
 troulet
 mbeligne
 acarbonnelle
-annegf
\ No newline at end of file
+annegf
+tleduc
+abey
+mbarletta
+lmaritaud
+jbeaureder
+kboczon
+llacoste
+fcorsi
+ecarlier
+lvanbogaert
+nrousselot
+jlevy1
+mflecheux
+pbai
+ymonnier
+slecuyerchardevel
\ No newline at end of file
--- a/0-collect-data/nakala.py
+++ b/0-collect-data/nakala.py
@@ -68,18 +68,20 @@ def get_dois_n_users(user_id):

 import json, requests

-parent_folder = "0-collect-data"
+print("\n\nRunning nakala.py")
+
+
 ## list to stock datas
 nakala_uga_users = []
 all_dois = []
 other_user_finded = []

-## importer les users depuis le fichier txt
-with open(parent_folder + '/nakala-uga-users.txt', 'r') as f:
+## load nakala users from txt file
+with open('nakala-uga-users.txt', 'r') as f:
    ## attention bien stripper quand on import du txt sinon les sauts de ligne sont présents
    [nakala_uga_users.append( user.strip() ) for user in f.readlines()]

-print("nb d'utilisateur Nakala UGA importés", len(nakala_uga_users))
+print("\tnb nakala users loaded", len(nakala_uga_users))

 # ____n____ iterer sur les users uga
 for user in nakala_uga_users : 
@@ -88,7 +90,7 @@ for user in nakala_uga_users :
    res = get_dois_n_users(user)
    
    if res["continue"] :
-        print(f"{user}\n\tnb DOI {len(res['content'])}" )
+        #print(f"{user}\n\tnb DOI {len(res['content'])}" )
        
        ## ajouter les DOIs trouvés
        if len(res["content"]) > 0 : 
@@ -99,14 +101,17 @@ for user in nakala_uga_users :
            #print(f"\nnew person {','.join(res['other_users'])}")
            other_user_finded += [x for x in res["other_users"] ]

+print(f"\tnb dois finded\t\t{len(all_dois)}")
+

 ## ____n____ exporter les DOI au format txt

-with open(parent_folder + "/nakala-dois.txt", 'w') as fh :
+with open("nakala-dois.txt", 'w') as fh :
    [fh.write(f"{line}\n") for line in all_dois]

-## print les autres utilisateurs trouvés7
-print("\n\n nakala user trouvés ")
-for elem in other_user_finded : 
-    print(elem)
+## print les autres utilisateurs trouvés
+if other_user_finded : 
+    print("\n\n\tnakala new user finded ")
+    for elem in other_user_finded : 
+        print(f"\t\t\t{elem}")

--- a/0-collect-data/rdg-dois.txt
+++ b/0-collect-data/rdg-dois.txt
+10.57745/XHQ7TL
+10.15454/O93984
+10.57745/3D4DFW
+10.57745/69UNAM
+10.57745/TVAHUQ
+10.57745/3VMB3Y
+10.57745/TOR3SF
+10.57745/OVCWQN
+10.57745/GRHRZJ
+10.57745/Z3BG2U
+10.57745/QCVYG3
+10.15454/M7OK9E
+10.57745/MXEMI4
+10.57745/BYWEA3
+10.57745/7RFNNP
+10.57745/B6PSX0
+10.57745/JOZ1NA
+10.57745/OT1IFB
+10.57745/NGC4J0
+10.57745/ID1LS6
+10.57745/LPJ2S2
+10.57745/GZKUZS
+10.57745/LXTWNG
+10.57745/KTFZQD
+10.57745/ENJADK
+10.57745/NZFWP9
+10.57745/NOHRHJ
+10.57745/RUQLJL
+10.57745/UOGRPY
+10.57745/5O6QIH
+10.57745/J2A44Q
+10.57745/7HF7KG
+10.57745/W9N5Z9
+10.15454/8UIA76
+10.57745/HZDPTT
+10.57745/IZHDPC
+10.57745/52HT2L
+10.57745/CM2WOI
+10.57745/R1NIKK
+10.57745/QOA1QO
+10.57745/LUTMNE
+10.57745/YWBDQQ
--- a/0-collect-data/rdg.py
+++ b/0-collect-data/rdg.py
+# Récupérer les DOIs de l'UGA depuis Recherche Data Gouv
+## 2023-12-01, Elias Chetouane
+
+"""
+## Documentation
+* Noms des champs à requêter : https://entrepot.recherche.data.gouv.fr/api/metadatablocks/citation
+* Doc API Dataverse : https://guides.dataverse.org/en/5.12.1/api/search.html
+
+## Remarques
+- Problème : certaines affiliations renseignent la ville, mais le labo n'est pas affilié à l'UGA. Cela les fait apparaitre alors qu'il ne faudrait pas les prendre en compte...
+- Obligation de chercher Grenoble et UGA car champ libre, donc parfois l'affiliation à l'UGA est renseignée par UGA et parfois Univ Grenoble Alpes
+- Adapter une fois mise en place du ROR
+"""
+
+import requests
+
+# base queries: one per affiliation field of interest (dataset contact, authors,
+# producer, contributors), each searched first for "UGA" then for "Grenoble AND Alpes"
+_search_root = 'https://entrepot.recherche.data.gouv.fr/api/search?q=*&fq='
+_affiliation_fields = [
+    'datasetContactAffiliation',
+    'authorAffiliation',
+    'producerAffiliation',
+    'contributorAffiliation',
+]
+# more search terms can be added here
+_search_terms = ['UGA', '(Grenoble AND Alpes)']
+
+urls = [
+    f"{_search_root}{field}%3A{term}"
+    for term in _search_terms
+    for field in _affiliation_fields
+]
+
+# run the search for one url and gather every page of results
+def get_results(url):
+    """
+    Query the search API with url (restricted to type=dataset) and follow the pagination.
+
+    Returns the list of decoded JSON answers, one element per request.
+    Raises requests.HTTPError when the API answers with an error status.
+    """
+    base_url = url + "&type=dataset"
+    req = requests.get(base_url, timeout=60)
+    req.raise_for_status()
+    #print(req.url)
+    results = [req.json()]
+
+    nb_res = results[0]["data"]["count_in_response"]
+    # total announced by the API; when the key is absent, fall back to paging until an empty page
+    total = results[0]["data"].get("total_count")
+    count = nb_res
+
+    # "start" is the offset of the next page, i.e. the number of items already received
+    while nb_res > 0 and (total is None or count < total):
+        req = requests.get(base_url + "&start=" + str(count), timeout=60)
+        req.raise_for_status()
+        results.append(req.json())
+        nb_res = results[-1]["data"]["count_in_response"]
+        count += nb_res
+    return results
+
+# gather the DOIs of every result page in a single list
+
+def get_dois(results):
+    """
+    Return the "global_id" of every item found in the paginated results.
+
+    Items without a "global_id" are skipped, so the returned list only
+    contains usable identifiers. The number of ids kept is printed.
+    """
+    dois = []
+
+    # take the results of every page into account
+    for res in results:
+        items = res["data"]["items"]
+        # skip items lacking an identifier: a None here would break the export step
+        dois.extend(item["global_id"] for item in items if item.get("global_id"))
+
+    print("\tnb DOIs\t\t" + str(len(dois)))
+    return dois
+
+print("\n\nRunning rdg.py")
+
+# collect the DOIs returned by every query
+dois = []
+
+for url in urls:
+    dois += get_dois(get_results(url))
+
+# remove duplicates and missing ids; sorted because a bare set has a
+# run-dependent order, which would rewrite the exported file on every run
+unique_dois = sorted({doi for doi in dois if doi})
+
+print("\tnb DOIs uniques\t\t" + str(len(unique_dois)))
+
+# export the DOI list as a txt file
+
+with open("rdg-dois.txt", 'w') as f:
+    # drop the leading "doi:" only when it is really there, instead of
+    # blindly cutting the first 4 characters of every identifier
+    for line in unique_dois:
+        f.write((line[4:] if line.startswith("doi:") else line) + "\n")
+
+
--- a/0-collect-data/uga-find-ror-childs/UGA-ror-childs--2023-12-29--manual.csv
+++ b/0-collect-data/uga-find-ror-childs/UGA-ror-childs--2023-12-29--manual.csv
+ror,name,type,docid,hal_child_nb,ror_relation_nb,,
+https://ror.org/05588ks88,LInguistique et DIdactique des Langues Étrangères et Maternelles,laboratory,1043147,0.0,,,
+https://ror.org/01yxtfe92,École nationale supérieure d'architecture de Grenoble,institution,1043352,1.0,,,
+https://ror.org/05hz99a17,"Ambiances, Architectures, Urbanités",laboratory,1088635,0.0,,,
+https://ror.org/0509qp208,Centre d'études et de recherches appliquées à la gestion,laboratory,1043181,0.0,,,
+https://ror.org/05sbt2524,Institut polytechnique de Grenoble - Grenoble Institute of Technology,institution,1043329,15.0,,,
+https://ror.org/02z8yps18,"Laboratoire de Génie des Procédés pour la Bioraffinerie, les Matériaux Bio-sourcés et l'Impression Fonctionnelle",laboratory,1162332,0.0,,,
+https://ror.org/04eg25g76,Laboratoire de Conception et d'Intégration des Systèmes,laboratory,1043068,0.0,,,
+https://ror.org/02wrme198,Grenoble Images Parole Signal Automatique,laboratory,1043333,0.0,,,
+https://ror.org/043pfpy19,Laboratoire des Écoulements Géophysiques et Industriels [Grenoble],laboratory,1043048,0.0,,,
+https://ror.org/03985kf35,Translational Innovation in Medicine and Complexity / Recherche Translationnelle et Innovation en Médecine et Complexité - UMR 5525,laboratory,1043049,0.0,,,
+https://ror.org/03bcdsr62,"Laboratoire sols, solides, structures - risques [Grenoble]",laboratory,1043064,0.0,,,
+https://ror.org/04dbzz632,Institut Néel,laboratory,1043183,0.0,,,
+https://ror.org/05hyx5a17,Laboratoire de Génie Electrique de Grenoble,laboratory,1043220,0.0,,,
+https://ror.org/01c8rcg82,Laboratoire d'Informatique de Grenoble,laboratory,1043301,0.0,,,
+https://ror.org/05afmzm11,VERIMAG,laboratory,1043148,0.0,,,
+https://ror.org/04ett5b41,Laboratoire Jean Kuntzmann,laboratory,1043077,0.0,,,
+https://ror.org/03taa9n66,"Institut de Microélectronique, Electromagnétisme et Photonique - Laboratoire d'Hyperfréquences et Caractérisation",laboratory,1043224,0.0,,,
+https://ror.org/03f0apy98,Laboratoire de Physique Subatomique et de Cosmologie,laboratory,1043144,0.0,,,
+https://ror.org/00fwjkb59,Laboratoire d'Economie Appliquée de Grenoble,laboratory,1043256,0.0,,,
+https://ror.org/04axb9j69,Laboratoire d'Electrochimie et de Physico-chimie des Matériaux et des Interfaces,laboratory,1043074,0.0,,,
+https://ror.org/04as3rk94,[GIN] Grenoble Institut des Neurosciences,regrouplaboratory,408885,0.0,,,
+https://ror.org/05rwrfh97,Institut Fourier,laboratory,1043234,0.0,,,
+https://ror.org/036zswm25,Laboratoire des technologies de la microélectronique,laboratory,1043042,0.0,,,
+https://ror.org/023n9q531,Laboratoire Interdisciplinaire de Physique [Saint Martin d’Hères],laboratory,1043294,0.0,,,
+https://ror.org/026j45x50,"Pacte, Laboratoire de sciences sociales",laboratory,1043180,0.0,,,
+https://ror.org/02mc6qk71,Laboratoire de physique et modélisation des milieux condensés,laboratory,1043152,0.0,,,
+https://ror.org/04szabx38,Institut de biologie structurale,department,1043235,0.0,,,
+https://ror.org/02dpnb389,Clinatec - Centre de recherche biomédicale Edmond J.Safra,department,416269,0.0,,,
+https://ror.org/03x1z2w73,Laboratoire d'Ecologie Alpine,laboratory,1043062,0.0,,,
+https://ror.org/045ktmd28,Laboratoire national des champs magnétiques intenses - Grenoble,laboratory,1043255,0.0,,,
+https://ror.org/04qz4qy85,LAboratoire de Recherche Historique Rhône-Alpes - UMR5190,laboratory,1043116,0.0,,,
+https://ror.org/041rhpw39,Centre Hospitalier Universitaire de Grenoble,healthcare,,,4.0,include,
+https://ror.org/000tdrn36,Centre Interuniversitaire de MicroElectronique et Nanotechnologies,facility,,,2.0,include,
+https://ror.org/02wmc6m46,Institut d'Histoire des Représentations et des Idées dans les Modernités,facility,,,6.0,error,old UMR
+https://ror.org/01cf2sz15,Institut des Sciences de la Terre,facility,,,5.0,include,
+https://ror.org/01wwcfa26,Institute of Environmental Geosciences,facility,,,5.0,include,
+https://ror.org/03eqm6y13,LabEx PERSYVAL-Lab,facility,,,5.0,include,
+https://ror.org/0157h5t87,"Langages, Littératures, Sociétés. Études Transfrontalières et Internationales",facility,,,2.0,error,old UMR
+https://ror.org/02f7wz369,Pierre Mendès-France University,education,,,7.0,include,
+https://ror.org/03yppfm65,Stendhal University,education,,,5.0,include,
+https://ror.org/02rmwrd87,"Laboratoire Environnements, Dynamiques et Territoires de Montagne",facility,,,4.0,include,
+https://ror.org/047p7mf25,Institut Carnot PolyNat,facility,,,2.0,include,
+https://ror.org/03vte9x46,Observatoire des Sciences de l'Univers de Grenoble,education,,,5.0,include,
+https://ror.org/05c99vk74,Laboratoire de Recherche sur les Apprentissages en Contexte,facility,,,2.0,include,
+https://ror.org/04fhvpc68,Département de Pharmacochimie Moléculaire,facility,,,2.0,include,
+https://ror.org/03jrb0276,"Laboratoire Inter-universitaire de Psychologie: Personnalité, Cognition, Changement Social",facility,,,2.0,include,
+https://ror.org/01kbr1737,PHotonique ELectronique et Ingénierie QuantiqueS,facility,,,2.0,include,
+https://ror.org/00ndvqf03,Laboratoire Modélisation et Exploration des Matériaux,facility,,,2.0,include,
+https://ror.org/0467x8h16,Maison des Sciences de l'Homme-Alpes,facility,,,2.0,include,
+https://ror.org/02cmt9z73,Agence pour les Mathématiques en Interaction avec l'Entreprise et la Société,facility,,,2.0,exclude,GDR national
+https://ror.org/026m44z54,Laboratoire Nanotechnologies et Nanosystèmes,facility,,,6.0,include,
+https://ror.org/03949e763,Integrated Structural Biology Grenoble,facility,,,3.0,include,
+https://ror.org/04ndt7n58,"Centre d'Etudes et de Recherche sur la diplomatie, l’Administration Publique et le Politique",facility,,,2.0,include,
+https://ror.org/044cfnj78,GRIDCAD - Grenoble Alpes Recherche-Infrastructure de Calcul intensif et de Données,facility,,,4.0,include,
+https://ror.org/03e044190,Spintronique et Technologie des Composants,facility,,,4.0,include,
+https://ror.org/05hb8m595,PHOTOSYNTHESE,other,,,12.0,exclude,GDR national
+https://ror.org/01w1erp60,Couplage Multi-physiques et Multi-échelles en mécanique géo-environnemental,other,,,19.0,exclude,GDR national
+https://ror.org/05be9p317,Micropesanteur Fondamentale et Appliquée,other,,,14.0,exclude,GDR national
+https://ror.org/01x6z5t49,Fédération de Recherche Spectroscopies de Photoémission,other,,,42.0,exclude,FR national
+https://ror.org/01nrtdp55,GDR NBODY : Problème quantique à N corps en chimie et physique,facility,,,38.0,exclude,GDR national
+https://ror.org/00hgbrg14,"Fédération française Matériaux sous hautes vitesses de déformation. Application aux matériaux en conditions extrêmes, Procédés et structures",other,,,16.0,exclude,FR national
+https://ror.org/0459fdx51,Fédération de Recherche sur l'Energie Solaire,other,,,45.0,exclude,FR national
+https://ror.org/04yem5s35,INFRANALYTICS,other,,,16.0,exclude,IR national
+https://ror.org/01sgwka45,Microscopie Fonctionnelle du Vivant,facility,,,31.0,exclude,GDR national
--- a/0-collect-data/uga-find-ror-childs/UGA-ror-childs--2023-12-29.csv
+++ b/0-collect-data/uga-find-ror-childs/UGA-ror-childs--2023-12-29.csv
+ror,name,type,docid,hal_child_nb,ror_relation_nb
+https://ror.org/05588ks88,LInguistique et DIdactique des Langues Étrangères et Maternelles,laboratory,1043147,0.0,
+https://ror.org/01yxtfe92,École nationale supérieure d'architecture de Grenoble,institution,1043352,1.0,
+https://ror.org/05hz99a17,"Ambiances, Architectures, Urbanités",laboratory,1088635,0.0,
+https://ror.org/0509qp208,Centre d'études et de recherches appliquées à la gestion,laboratory,1043181,0.0,
+https://ror.org/05sbt2524,Institut polytechnique de Grenoble - Grenoble Institute of Technology,institution,1043329,15.0,
+https://ror.org/02z8yps18,"Laboratoire de Génie des Procédés pour la Bioraffinerie, les Matériaux Bio-sourcés et l'Impression Fonctionnelle",laboratory,1162332,0.0,
+https://ror.org/04eg25g76,Laboratoire de Conception et d'Intégration des Systèmes,laboratory,1043068,0.0,
+https://ror.org/02wrme198,Grenoble Images Parole Signal Automatique,laboratory,1043333,0.0,
+https://ror.org/043pfpy19,Laboratoire des Écoulements Géophysiques et Industriels [Grenoble],laboratory,1043048,0.0,
+https://ror.org/03985kf35,Translational Innovation in Medicine and Complexity / Recherche Translationnelle et Innovation en Médecine et Complexité - UMR 5525,laboratory,1043049,0.0,
+https://ror.org/03bcdsr62,"Laboratoire sols, solides, structures - risques [Grenoble]",laboratory,1043064,0.0,
+https://ror.org/04dbzz632,Institut Néel,laboratory,1043183,0.0,
+https://ror.org/05hyx5a17,Laboratoire de Génie Electrique de Grenoble,laboratory,1043220,0.0,
+https://ror.org/01c8rcg82,Laboratoire d'Informatique de Grenoble,laboratory,1043301,0.0,
+https://ror.org/05afmzm11,VERIMAG,laboratory,1043148,0.0,
+https://ror.org/04ett5b41,Laboratoire Jean Kuntzmann,laboratory,1043077,0.0,
+https://ror.org/03taa9n66,"Institut de Microélectronique, Electromagnétisme et Photonique - Laboratoire d'Hyperfréquences et Caractérisation",laboratory,1043224,0.0,
+https://ror.org/03f0apy98,Laboratoire de Physique Subatomique et de Cosmologie,laboratory,1043144,0.0,
+https://ror.org/00fwjkb59,Laboratoire d'Economie Appliquée de Grenoble,laboratory,1043256,0.0,
+https://ror.org/04axb9j69,Laboratoire d'Electrochimie et de Physico-chimie des Matériaux et des Interfaces,laboratory,1043074,0.0,
+https://ror.org/04as3rk94,[GIN] Grenoble Institut des Neurosciences,regrouplaboratory,408885,0.0,
+https://ror.org/036zswm25,Laboratoire des technologies de la microélectronique,laboratory,1043042,0.0,
+https://ror.org/05rwrfh97,Institut Fourier,laboratory,1043234,0.0,
+https://ror.org/026j45x50,"Pacte, Laboratoire de sciences sociales",laboratory,1043180,0.0,
+https://ror.org/023n9q531,Laboratoire Interdisciplinaire de Physique [Saint Martin d’Hères],laboratory,1043294,0.0,
+https://ror.org/04szabx38,Institut de biologie structurale,department,1043235,0.0,
+https://ror.org/02mc6qk71,Laboratoire de physique et modélisation des milieux condensés,laboratory,1043152,0.0,
+https://ror.org/02dpnb389,Clinatec - Centre de recherche biomédicale Edmond J.Safra,department,416269,0.0,
+https://ror.org/03x1z2w73,Laboratoire d'Ecologie Alpine,laboratory,1043062,0.0,
+https://ror.org/045ktmd28,Laboratoire national des champs magnétiques intenses - Grenoble,laboratory,1043255,0.0,
+https://ror.org/04qz4qy85,LAboratoire de Recherche Historique Rhône-Alpes - UMR5190,laboratory,1043116,0.0,
+https://ror.org/041rhpw39,Centre Hospitalier Universitaire de Grenoble,healthcare,,,4.0
+https://ror.org/000tdrn36,Centre Interuniversitaire de MicroElectronique et Nanotechnologies,facility,,,2.0
+https://ror.org/02wmc6m46,Institut d'Histoire des Représentations et des Idées dans les Modernités,facility,,,6.0
+https://ror.org/01cf2sz15,Institut des Sciences de la Terre,facility,,,5.0
+https://ror.org/01wwcfa26,Institute of Environmental Geosciences,facility,,,5.0
+https://ror.org/03eqm6y13,LabEx PERSYVAL-Lab,facility,,,5.0
+https://ror.org/0157h5t87,"Langages, Littératures, Sociétés. Études Transfrontalières et Internationales",facility,,,2.0
+https://ror.org/02f7wz369,Pierre Mendès-France University,education,,,7.0
+https://ror.org/03yppfm65,Stendhal University,education,,,5.0
+https://ror.org/02rmwrd87,"Laboratoire Environnements, Dynamiques et Territoires de Montagne",facility,,,4.0
+https://ror.org/047p7mf25,Institut Carnot PolyNat,facility,,,2.0
+https://ror.org/03vte9x46,Observatoire des Sciences de l'Univers de Grenoble,education,,,5.0
+https://ror.org/05c99vk74,Laboratoire de Recherche sur les Apprentissages en Contexte,facility,,,2.0
+https://ror.org/04fhvpc68,Département de Pharmacochimie Moléculaire,facility,,,2.0
+https://ror.org/03jrb0276,"Laboratoire Inter-universitaire de Psychologie: Personnalité, Cognition, Changement Social",facility,,,2.0
+https://ror.org/01kbr1737,PHotonique ELectronique et Ingénierie QuantiqueS,facility,,,2.0
+https://ror.org/00ndvqf03,Laboratoire Modélisation et Exploration des Matériaux,facility,,,2.0
+https://ror.org/0467x8h16,Maison des Sciences de l'Homme-Alpes,facility,,,2.0
+https://ror.org/02cmt9z73,Agence pour les Mathématiques en Interaction avec l'Entreprise et la Société,facility,,,2.0
+https://ror.org/026m44z54,Laboratoire Nanotechnologies et Nanosystèmes,facility,,,6.0
+https://ror.org/03949e763,Integrated Structural Biology Grenoble,facility,,,3.0
+https://ror.org/04ndt7n58,"Centre d'Etudes et de Recherche sur la diplomatie, l’Administration Publique et le Politique",facility,,,2.0
+https://ror.org/044cfnj78,GRIDCAD - Grenoble Alpes Recherche-Infrastructure de Calcul intensif et de Données,facility,,,4.0
+https://ror.org/03e044190,Spintronique et Technologie des Composants,facility,,,4.0
+https://ror.org/05hb8m595,PHOTOSYNTHESE,other,,,12.0
+https://ror.org/01w1erp60,Couplage Multi-physiques et Multi-échelles en mécanique géo-environnemental,other,,,19.0
+https://ror.org/05be9p317,Micropesanteur Fondamentale et Appliquée,other,,,14.0
+https://ror.org/01x6z5t49,Fédération de Recherche Spectroscopies de Photoémission,other,,,42.0
+https://ror.org/01nrtdp55,GDR NBODY : Problème quantique à N corps en chimie et physique,facility,,,38.0
+https://ror.org/00hgbrg14,"Fédération française Matériaux sous hautes vitesses de déformation. Application aux matériaux en conditions extrêmes, Procédés et structures",other,,,16.0
+https://ror.org/0459fdx51,Fédération de Recherche sur l'Energie Solaire,other,,,45.0
+https://ror.org/04yem5s35,INFRANALYTICS,other,,,16.0
+https://ror.org/01sgwka45,Microscopie Fonctionnelle du Vivant,facility,,,31.0
--- a/0-collect-data/uga-find-ror-childs/ror-retrieve-uga-rors.py
+++ b/0-collect-data/uga-find-ror-childs/ror-retrieve-uga-rors.py
+# Récupérer les RORs de l'env. UGA
+## 2024-01-08, Maxence Larrieu
+
+
+"""
+## step
+- recupere les ROR qui ont pour parent le ROR de l'UGA
+- HAL : on réduit aux structures VALID
+- ROR apporte du bruit et demande un nettoyage manuel a posteriori
+- ROR : on retire les UMR passées et les réseaux nationaux type GDR, FR, IR
+- cette dernière étape se fait à la main à partir des tableaux produits
+
+
+## documentation
+
+* HAL
+https://api.archives-ouvertes.fr/ref/structure/?q=parentRor_id:02rx3b187
+
+
+* ROR 
+https://ror.readme.io/docs/ror-schema-api-v2-beta
+https://api.dev.ror.org/v2/organizations/02rx3b187
+"""
+
+
+import requests, json, pandas as pd
+import datetime
+
+
+## ___0____ from HAL structure API get UGA ROR childs
+
+def hal_struct_return_docs(ror_id) :
+	"""
+	Query the HAL structure referential for the structures whose parent is ror_id.
+	https://api.archives-ouvertes.fr/docs/ref/?resource=structure&schema=fields#fields
+
+	Only structures flagged VALID and carrying a ROR are requested, limited to 500 rows.
+	Returns the raw list stored under the "response" -> "docs" keys of the JSON answer.
+	Raises requests.HTTPError when HAL answers with an error status.
+	"""
+
+	# query in human language: childs of the ROR structure that are VALID and have a ROR id
+	# adjacent literals rather than a backslash continuation inside the string:
+	# the continuation kept the leading tab of the following line inside the query
+	req_filter = (
+		f"q=parentRor_id:{ror_id}&fq=ror_s:[%22%22%20TO%20*]"
+		"&fq=valid_s:VALID&fl=ror_s,name_s,type_s,docid"
+	)
+
+	req = requests.get("https://api.archives-ouvertes.fr/ref/structure/?" + req_filter + "&rows=500", timeout=60)
+	# to debug print(req.url)
+	req.raise_for_status()
+	res = req.json()
+	return res["response"]["docs"]
+
+
+def hal_struct_return_ror_childs(ror_id, hal_results = None) :
+	"""
+	Recursively walk the HAL structures located under ror_id.
+
+	For every ROR of every structure found, one row
+	[ror url, name, type, docid, number of childs] is appended to the
+	module-level list ror_childs, then the same walk is done on that ROR
+	when it has childs of its own.
+
+	ror_id -- ROR identifier (without the URL part) of the parent structure
+	hal_results -- optional, already fetched HAL results for ror_id; used by the
+	recursion so that each structure is queried only once
+	"""
+	if hal_results is None :
+		hal_results = hal_struct_return_docs(ror_id)
+
+	for item in hal_results :
+		# in HAL structure API, ror are in list
+		for ror_url in item["ror_s"] :
+
+			## find the number of ROR childs of the sub element
+			child_id = convert_ror_url_to_id(ror_url)
+			child_results = hal_struct_return_docs(child_id)
+			nb_childs = len(child_results)
+
+			ror_childs.append([ror_url, item["name_s"], item["type_s"], item["docid"], nb_childs])
+
+			## if the sub element has ROR childs, iterate on the results just fetched
+			if nb_childs > 0 :
+				hal_struct_return_ror_childs(child_id, child_results)
+
+
+def convert_ror_url_to_id (ror_url) :
+	"""Return what follows the last "/" of ror_url (the whole string when it has no "/")."""
+	# e.g. https://ror.org/05rwrfh97 gives 05rwrfh97
+	## needed because the HAL API returns the URL form in the field ror_s
+	*_, ror_id = ror_url.rpartition("/")
+	return ror_id
+
+
+# module-level buffer, filled in place by hal_struct_return_ror_childs
+ror_childs = []
+# start the recursive collection from the ROR id 02rx3b187
+hal_struct_return_ror_childs("02rx3b187")
+
+print(f"from HAL, nb of childs raw\t{len(ror_childs)}")
+
+# one row per collected structure, then a single row is kept for each ROR
+df_hal = pd.DataFrame(ror_childs, columns = ["ror", "name", "type", "docid", "hal_child_nb"])
+df_hal.drop_duplicates(subset = ["ror"], inplace = True)
+print(f"from HAL, nb of unique childs\t{len(df_hal)}")
+
+
+## ____2____  from ROR API get UGA ROR childs
+
+def ror_get_child_info(ror_id) : 
+	req = requests.get("https://api.dev.ror.org/v2/organizations/" + ror_id)
+	res = req.json()
+	types_list = [ item for item in res["types"]]
+	org_type = ",".join(types_list)
+	return {
+		"org_type" : org_type,
+		"relation_nb" : len(res["relationships"])
+		}
+
+# query the ROR API with the UGA ROR id
+req = requests.get("https://api.dev.ror.org/v2/organizations/02rx3b187")
+res = req.json()
+# the "relationships" field contains all the ROR "childs"
+ror_list = res["relationships"]
+print(f"from ROR, nb of childs finded\t{len(ror_list)}")
+
+## for each ROR found that HAL did not already give, enrich it with the ROR API
+data_from_rors = []
+for item in ror_list :
+	# skip the organizations already present in the HAL table
+	if item["id"] not in df_hal.ror.values : 
+		org_info = ror_get_child_info(item["id"])
+		data_from_rors.append( 
+			[ item["id"], item["label"], org_info["org_type"], org_info["relation_nb"] ]
+		)
+
+
+df_ror = pd.DataFrame(data_from_rors, columns = ["ror", "name", "type", "ror_relation_nb"])
+
+# HAL rows first, then the rows found only through ROR
+df = pd.concat([df_hal, df_ror], ignore_index = True)
+date = datetime.datetime.now().strftime("%Y-%m-%d") ## date formatted like 2023-12-29
+# the output file name carries the date of the run
+df.to_csv(f"UGA-ror-childs--{date}.csv", index = False)
+
+
+print("nb of RORs finded", len(df))
--- a/0-collect-data/z-datacite-demo.ipynb
+++ b/0-collect-data/z-datacite-demo.ipynb
+%% Cell type:markdown id:ce5cebe1-a058-4c0f-a5b5-23a02baa3521 tags:
+
+# Démo récupération des DOIs affiliés à l'UGA depuis Datacite
+
+## Documentation
+* Doc générale API Datacite : https://support.datacite.org/docs/api
+* Recherche d'un DOI : https://support.datacite.org/docs/api-sampling
+* Pagination : https://support.datacite.org/docs/pagination
+
+## Code
+
+%% Cell type:code id:d229201d-4e79-40a8-9472-9ea46b344b1c tags:
+
+``` python
+import requests, json, pandas
+```
+
+%% Cell type:code id:7ce19b89-d5b7-4dbe-9fab-a15a81b42078 tags:
+
+``` python
+# build the URL : DataCite DOIs endpoint, page size set to 1000
+url_pre = 'https://api.datacite.org/dois?affiliation=true&page[size]=1000'
+# query : creators whose affiliation identifier is the ROR 02rx3b187, resource type Dataset
+url_query = '&query=(creators.affiliation.affiliationIdentifier:"https://ror.org/02rx3b187") AND (types.resourceTypeGeneral:Dataset)'
+
+# send the request and decode the JSON answer
+req = requests.get(url_pre + url_query)
+results = req.json()
+```
+
+%% Cell type:code id:ec73be41-21df-4448-b58e-e21306f6b9fa tags:
+
+``` python
+# ajouter les DOIs dans une liste, sans ajouter les résultats qui ne sont pas des DOIs
+# ajouter les autres résultats qui nous intéressent dans des listes afin de construire un DataFrame
+dois = []
+not_dois = []
+titles = []
+dates = []
+authors = []
+
+# boucler pour ajouter les informations relatives à chaque dépôt
+all_md_list = results["data"]
+nb_dois = len(all_md_list)
+
+for item in all_md_list :
+    doi = item["id"]
+    # si l'identifiant n'est pas un doi, on ne le prend pas
+    if item.get("type") != "dois":
+        print("Le résultat " + str(item) + " est de type " + item.get("type") + " : " + doi)
+        not_dois.append(doi)
+    # sinon, on récupère les informations dont on a besoin
+    else:
+        dois.append(doi)
+        titles.append(item["attributes"]["titles"][0].get("title"))
+        dates.append(item["attributes"]["created"])
+        # boucler pour obtenir tous les auteurs
+        auts = []
+        for aut in item["attributes"]["creators"]:
+            auts.append(aut.get("name"))
+        authors.append(auts)
+
+# affichage du résultat
+print("Nombre de résultats trouvés : " + str(nb_dois))
+```
+
+%% Output
+
+    Nombre de résultats trouvés : 142
+
+%% Cell type:code id:e0722b5b-0059-4842-8e8a-2125239a7b7d tags:
+
+``` python
+# build the DataFrame from the parallel lists filled above
+df = pandas.DataFrame({'DOI':dois, 'Titre':titles, 'Date':dates, 'Auteurs':authors})
+
+# save the table next to the notebook, then display it as the cell output
+df.to_csv("z-resultats-demo-datacite.csv")
+df
+```
+
+%% Output
+
+                                                  DOI  \
+    0                                  10.7280/d11h3x
+    1                                  10.7280/d1mm37
+    2                                  10.7280/d1595v
+    3                                  10.7280/d1667w
+    4                                  10.7280/d1b114
+    ..                                            ...
+    137                  10.6084/m9.figshare.23488967
+    138                               10.18150/wyyjk6
+    139                               10.13127/efsm20
+    140  10.5285/3ea504d8-41c2-40dc-86dc-284c341badaa
+    141  10.5285/634ee206-258f-4b47-9237-efff4ef9eedd
+    
+                                                     Titre                  Date  \
+    0    Annual Ice Velocity of the Greenland Ice Sheet...  2019-03-29T12:53:36Z
+    1    Annual Ice Velocity of the Greenland Ice Sheet...  2018-12-14T09:39:45Z
+    2    Annual Ice Velocity of the Greenland Ice Sheet...  2019-03-29T10:37:23Z
+    3    Greenland Marine-Terminating Glacier Retreat Data  2020-12-01T18:09:19Z
+    4    Dataset for: Fast retreat of Pope, Smith, and ...  2021-11-01T23:46:08Z
+    ..                                                 ...                   ...
+    137  Additional file 1 of 3DVizSNP: a tool for rapi...  2023-06-10T03:21:52Z
+    138  Estimates for recombination coefficients from ...  2022-04-21T14:17:28Z
+    139  European Fault-Source Model 2020 (EFSM20): onl...  2022-10-30T16:28:46Z
+    140  Ice radar data from Little Dome C, Antarctica,...  2022-03-04T09:26:18Z
+    141  Polarimetric ApRES data on a profile across Do...  2021-09-16T11:17:15Z
+    
+                                                   Auteurs
+    0    [Mouginot, Jeremie, Rignot, Eric, Scheuchl, Be...
+    1    [Mouginot, Jeremie, Rignot, Eric, Millan, Roma...
+    2    [Mouginot, Jeremie, Rignot, Eric, Scheuchl, Be...
+    3    [Wood, Michael, Rignot, Eric, Bjørk, Anders, V...
+    4    [Milillo, Pietro, Rignot, Eric, Rizzoli, Paola...
+    ..                                                 ...
+    137  [Sierk, Michael, Ratnayake, Shashikala, Wagle,...
+    138  [Sakowski, Konrad, Borowik, Lukasz, Rochat, Né...
+    139  [Basili, Roberto, Danciu, Laurentiu, Beauval, ...
+    140  [Mulvaney, Robert, King, Edward, Martin, Carlo...
+    141      [Corr, Hugh, Ritz, Catherine, Martin, Carlos]
+    
+    [142 rows x 4 columns]
+
+%% Cell type:code id:0308feea-2560-4e33-836a-285a65db2429 tags:
+
+``` python
+```
+%% Cell type:markdown id:ce5cebe1-a058-4c0f-a5b5-23a02baa3521 tags:
+
+# Démo récupération des DOIs affiliés à l'UGA depuis Datacite
+
+## Documentation
+* Doc générale API Datacite : https://support.datacite.org/docs/api
+* Recherche d'un DOI : https://support.datacite.org/docs/api-sampling
+* Pagination : https://support.datacite.org/docs/pagination
+
+## Code
+
+%% Cell type:code id:d229201d-4e79-40a8-9472-9ea46b344b1c tags:
+
+``` python
+import requests, json, pandas
+```
+
+%% Cell type:code id:7ce19b89-d5b7-4dbe-9fab-a15a81b42078 tags:
+
+``` python
+# build the URL : DataCite DOIs endpoint, page size set to 1000
+url_pre = 'https://api.datacite.org/dois?affiliation=true&page[size]=1000'
+# query : creators whose affiliation identifier is the ROR 02rx3b187, resource type Dataset
+url_query = '&query=(creators.affiliation.affiliationIdentifier:"https://ror.org/02rx3b187") AND (types.resourceTypeGeneral:Dataset)'
+
+# send the request and decode the JSON answer
+req = requests.get(url_pre + url_query)
+results = req.json()
+```
+
+%% Cell type:code id:ec73be41-21df-4448-b58e-e21306f6b9fa tags:
+
+``` python
+# ajouter les DOIs dans une liste, sans ajouter les résultats qui ne sont pas des DOIs
+# ajouter les autres résultats qui nous intéressent dans des listes afin de construire un DataFrame
+dois = []
+not_dois = []
+titles = []
+dates = []
+authors = []
+
+# boucler pour ajouter les informations relatives à chaque dépôt
+all_md_list = results["data"]
+nb_dois = len(all_md_list)
+
+for item in all_md_list :
+    doi = item["id"]
+    # si l'identifiant n'est pas un doi, on ne le prend pas
+    if item.get("type") != "dois":
+        print("Le résultat " + str(item) + " est de type " + item.get("type") + " : " + doi)
+        not_dois.append(doi)
+    # sinon, on récupère les informations dont on a besoin
+    else:
+        dois.append(doi)
+        titles.append(item["attributes"]["titles"][0].get("title"))
+        dates.append(item["attributes"]["created"])
+        # boucler pour obtenir tous les auteurs
+        auts = []
+        for aut in item["attributes"]["creators"]:
+            auts.append(aut.get("name"))
+        authors.append(auts)
+
+# affichage du résultat
+print("Nombre de résultats trouvés : " + str(nb_dois))
+```
+
+%% Output
+
+    Nombre de résultats trouvés : 142
+
+%% Cell type:code id:e0722b5b-0059-4842-8e8a-2125239a7b7d tags:
+
+``` python
+# build the DataFrame from the parallel lists filled above
+df = pandas.DataFrame({'DOI':dois, 'Titre':titles, 'Date':dates, 'Auteurs':authors})
+
+# save the table next to the notebook, then display it as the cell output
+df.to_csv("z-resultats-demo-datacite.csv")
+df
+```
+
+%% Output
+
+                                                  DOI  \
+    0                                  10.7280/d11h3x
+    1                                  10.7280/d1mm37
+    2                                  10.7280/d1595v
+    3                                  10.7280/d1667w
+    4                                  10.7280/d1b114
+    ..                                            ...
+    137                  10.6084/m9.figshare.23488967
+    138                               10.18150/wyyjk6
+    139                               10.13127/efsm20
+    140  10.5285/3ea504d8-41c2-40dc-86dc-284c341badaa
+    141  10.5285/634ee206-258f-4b47-9237-efff4ef9eedd
+    
+                                                     Titre                  Date  \
+    0    Annual Ice Velocity of the Greenland Ice Sheet...  2019-03-29T12:53:36Z
+    1    Annual Ice Velocity of the Greenland Ice Sheet...  2018-12-14T09:39:45Z
+    2    Annual Ice Velocity of the Greenland Ice Sheet...  2019-03-29T10:37:23Z
+    3    Greenland Marine-Terminating Glacier Retreat Data  2020-12-01T18:09:19Z
+    4    Dataset for: Fast retreat of Pope, Smith, and ...  2021-11-01T23:46:08Z
+    ..                                                 ...                   ...
+    137  Additional file 1 of 3DVizSNP: a tool for rapi...  2023-06-10T03:21:52Z
+    138  Estimates for recombination coefficients from ...  2022-04-21T14:17:28Z
+    139  European Fault-Source Model 2020 (EFSM20): onl...  2022-10-30T16:28:46Z
+    140  Ice radar data from Little Dome C, Antarctica,...  2022-03-04T09:26:18Z
+    141  Polarimetric ApRES data on a profile across Do...  2021-09-16T11:17:15Z
+    
+                                                   Auteurs
+    0    [Mouginot, Jeremie, Rignot, Eric, Scheuchl, Be...
+    1    [Mouginot, Jeremie, Rignot, Eric, Millan, Roma...
+    2    [Mouginot, Jeremie, Rignot, Eric, Scheuchl, Be...
+    3    [Wood, Michael, Rignot, Eric, Bjørk, Anders, V...
+    4    [Milillo, Pietro, Rignot, Eric, Rizzoli, Paola...
+    ..                                                 ...
+    137  [Sierk, Michael, Ratnayake, Shashikala, Wagle,...
+    138  [Sakowski, Konrad, Borowik, Lukasz, Rochat, Né...
+    139  [Basili, Roberto, Danciu, Laurentiu, Beauval, ...
+    140  [Mulvaney, Robert, King, Edward, Martin, Carlo...
+    141      [Corr, Hugh, Ritz, Catherine, Martin, Carlos]
+    
+    [142 rows x 4 columns]
+
+%% Cell type:code id:0308feea-2560-4e33-836a-285a65db2429 tags:
+
+``` python
+```
--- a/0-collect-data/zenodo-dois.txt
+++ b/0-collect-data/zenodo-dois.txt
--- a/0-collect-data/zenodo.py
+++ b/0-collect-data/zenodo.py
 # Récupérer les DOIs de l'UGA depuis Zenodo
-## 2023-11-29, Maxence Larrieu
+## 2023-12-06, Maxence Larrieu


 """
 ## todo
- recherche itératives par types : dataset, image, video, software, physical object, other OU bien regarder si on requeter aussi dans le champs type directment
+- v2 : rechercher UGA comme financeur `grants.funder.doi`

 ## Documentation
 * Liste des métadonnées obligatoires lors du dépôts (upload_type, sub_type, publication_date, titre, creators, ) https://developers.zenodo.org/#representation
-* Recherche dans l'interfance : https://zenodo.org/search?q=creators.affiliation%3Agrenoble&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
+* Recherche dans l'interface : https://zenodo.org/search?q=creators.affiliation%3Agrenoble&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
 * Doc dev API champs de requêtes classsiques https://developers.zenodo.org/#records
 * doc champs poussées pour la recherche https://help.zenodo.org/guides/search/
 * typologie des dépôts possiblent : publication: Publication, poster: Poster, presentation: Presentation, Dataset: Dataset, image: Image, video: Video/Audio, software: Software, lesson: Lesson, physicalobject: Physical object, other: Other
+* descendre au niveau des ORCID des `creator.orcid` et `contributors.orcid`


-### Identifier les dépôts : 
- utilisation du champs `creators.affiliation` (multi affiliation)
- v2 : chercher aussi au niveau des "contributeurs" : `contributors.affiliation` et au niveau auteur `contributors.orcid`
- v2 : passer par les ORCID des 'auteurs' `creator.orcid`
- v2 : rechercher UGA comme financeur `grants.funder.doi`
-
 ## Notes sur la récupération
- exemple résultats de requete : https://zenodo.org/api/records?q=creators.affiliation%3A*grenoble*&type=dataset&page=6&size=100&sort=mostrecent&all_version=False&access_tpoken=4A1P6e4gIkvhpcNpVpI05C9yVCjJIDD3vljILoJ1wlyxT9VA70ZXyPMdjqHB)
- deux DOI identiques sont présent : un à la racine `[hits][doi]` et un autre dans `[hits][metadata][doi]`
+- exemple résultats de requete : https://zenodo.org/api/records?q=creators.affiliation%3A*grenoble*&type=dataset&page=6&size=100&sort=mostrecent&all_version=False
+- deux DOI identiques sont présents : un à la racine `[hits][doi]` et un autre dans `[hits][metadata][doi]`
 - il y a des DOI renseignés par le déposant qui ne sont pas de Zenodo
 """

-import pandas as pd, requests, json
+import requests, json

-parent_folder = "0-collect-data"
-print("\n\nzenodo.py is launched")
+print("\n\nRunning zenodo.py")

-## import API key. Recquired to req Zenodo API
-with open(parent_folder + "/personnal-keys.json") as f : 
-    ACCESS_TOKEN = json.load(f)["ZENODO_KEY"]
-
-def req_zenodo(page_nb) :
+def req_zenodo_with_page(uga_perimeter, record_type, page_nb) :
    """
    retourne les jeux de données UGA depuis Zenodo
    page_nb précise le numéro de la page de la requête
@@ -44,16 +34,16 @@ def req_zenodo(page_nb) :
    r = requests.get(
        "https://zenodo.org/api/records",
        params = {
-            "q" : "creators.affiliation:grenoble",
-            "type" : "dataset",
+            "q" : uga_perimeter,
+            "type" : record_type,
            "page" : page_nb, 
            "size" : 100,
            "sort" : "mostrecent",
            "all_version" : False,
-            "access_tpoken" : ACCESS_TOKEN
            } 
    )
-    # to debug print(r.url)
+    # for debugging 
+    #print(r.url)

    ## si pb présent dans les résultats
    if r.status_code != 200 : 
@@ -62,7 +52,7 @@ def req_zenodo(page_nb) :
            "content" : r
            }

-    ## s'il n'y a pas de probleme
+    ## if no problem
    res = r.json()
    
    return {
@@ -71,40 +61,74 @@ def req_zenodo(page_nb) :
        }


-## var to initiate the loop
-output_dois = []
-current_page_number = 0
-iterate = True
+def req_zenodo(uga_perimeter, record_type) : 
+    """
+    permet de gérer la pagination de zenodo en fonction du nb de résultats donné par l'API
+    tant que le nombre de DOI récoltés est inférieur au résultat de la requête
+    """

-while iterate :
-    
-    current_page_number += 1
-    print("new iteration page =", current_page_number)
-    res = req_zenodo(current_page_number)
+    output_dois = []
+    current_page_number = 0
+    iterate = True
+
+    while iterate :
+
+        current_page_number += 1
+        res = req_zenodo_with_page(uga_perimeter, record_type, current_page_number)
+
+        # verifier si la requete a bien fonctionnée
+        if not res["continue"] : 
+            print(r"\t/!\ oups, problem with the query")
+            print(res["content"].status_code ) ## contient la réponse de la requête
+            break
+
+        # pour la premiere iteration, extraire le nb de DOI a récupérer
+        if current_page_number == 1 : 
+            nb_dois_to_get = res["content"]["total"]
+            print(f"\t{record_type}\n\tDOIs to get", nb_dois_to_get )

-    # verifier si la requete a bien fonctionnée
-    if not res["continue"] : 
-        print("oups, problem with the query")
-        print(res["content"].status_code ) ## res[content] contient la réponse de la requête
-        break
+        #to debug
+        #print("\titeration page nb", current_page_number)
+        
+        ## parcourir tous les DOIs et les placer dans la liste
+        ## placer le DOI uniquement si une donnée est présente
+        [output_dois.append( item["doi"] ) for item in res["content"]["hits"] if item.get("doi")]

-    # pour la premiere iteration, extraire le nb de DOI a récupérer
-    if current_page_number == 1 : 
-        nb_dois_to_get = res["content"]["total"]
-        print("\tfirst iteration. DOIs to get =", nb_dois_to_get )
+        ## managing the loop
+        ### if I have all the DOIs inside my outputs, then I had finish, if not continue
+        if len(output_dois) >= nb_dois_to_get : 
+            iterate = False

-    ## parcourir tous les DOIs et les placer dans la liste
-    ## placer le DOI uniquement si une donnée est présente
-    [output_dois.append( item.get("doi") ) for item in res["content"]["hits"] if item.get("doi")]
+    return output_dois

-    ## managing the loop
-    ### if I have all the DOIs inside my outputs, then I had finish
-    if len(output_dois) >= nb_dois_to_get : 
-        iterate = False

+##_______________________________________________
+
+all_dois = set() # a set to gather all DOIs
+
+uga_txt_query = "(\"grenoble alpes\" OR \"grenoble alps\" OR \"grenoble INP\" \
+OR \"polytechnique de grenoble\" OR \"Grenoble Institute of Technology\" OR \"univeristé de grenoble\" )"
+
+uga_query = f"creators.affiliation:{uga_txt_query} contributors.affiliation:{uga_txt_query}"
+
+## memo 2024-02 two fields following by space will search in first field or in second field
+## ne pas faire de recherche avec AND car ça recherche dans toutes les affiliations des auteurs
+## SceincesPo Grenoble n'apporte pas de résultat https://zenodo.org/search?q=creators.affiliation%3A%28sciencespo%20AND%20Grenoble%29&q=type%3Adataset&l=list&p=1&s=10&sort=bestmatch
+
+types_to_req = ["dataset", "image", "video", "software", "other"]
+
+for record_type in types_to_req : 
+
+    temp_dois = req_zenodo(uga_query, record_type)
+
+    ## placer les DOI dans le buffer général qui ne peut contenir que des valeurs uniques
+    [all_dois.add(doi) for doi in temp_dois]
+    
+
+print("\n\tnb DOIs finded\t",  len(all_dois) )

 ## ____z____ exporter la liste de DOI au format txt

-with open(parent_folder + "/zenodo-dois.txt", 'w') as f :
-    [f.write(f"{line}\n") for line in output_dois]
+with open("zenodo-dois.txt", 'w') as f :
+    [f.write(f"{line}\n") for line in all_dois]

--- a/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
+++ b/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
+client,count,name,year,url
+cern.zenodo,894,Zenodo,2013,https://zenodo.org/
+inist.sshade,527,Solid Spectroscopy Hosting Architecture of Databases and Expertise,2019,https://www.sshade.eu/
+figshare.ars,380,figshare Academic Research System,2016,http://figshare.com/
+inist.osug,275,Observatoire des Sciences de l'Univers de Grenoble,2014,http://doi.osug.fr
+dryad.dryad,168,DRYAD,2018,https://datadryad.org
+inist.resif,101,Réseau sismologique et géodésique français,2014,https://www.resif.fr/
+rdg.prod,84,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
+inist.humanum,75,NAKALA,2020,https://nakala.fr
+inist.persyval,64,PERSYVAL-Lab : Pervasive Systems and Algorithms Lab,2016,
+fmsh.prod,28,Fondation Maison des sciences de l'homme,2023,
+inist.ccj,22,Centre Camille Jullian – UMR 7299,2020,
+pangaea.repository,18,PANGAEA,2020,https://www.pangaea.de/
+mcdy.dohrmi,14,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
+inist.cirm,7,Centre International de Rencontres Mathématiques,2017,
+figshare.sage,6,figshare SAGE Publications,2018,
+iris.iris,5,NSF Seismological Facility for the Advancement of Geoscience (SAGE),2018,http://www.iris.edu/hq/
+tib.repod,4,RepOD,2015,https://repod.icm.edu.pl/
+vqpf.dris,3,Direction des ressources et de l'information scientifique,2021,
+tib.gfzbib,3,GFZpublic,2011,https://gfzpublic.gfz-potsdam.de
+cnic.sciencedb,3,ScienceDB,2022,https://www.scidb.cn/en
+inist.eost,2,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
+tib.gfz,2,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
+bl.mendeley,2,Mendeley Data,2015,https://data.mendeley.com/
+bl.nerc,2,NERC Environmental Data Service,2011,https://eds.ukri.org
+tug.openlib,2,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
+crui.ingv,2,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
+ugraz.unipub,2,unipub,2019,http://unipub.uni-graz.at
+ethz.sed,2,"Swiss Seismological Service, national earthquake monitoring and hazard center",2013,http://www.seismo.ethz.ch
+inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
+ethz.da-rd,1,ETHZ Data Archive - Research Data,2013,http://data-archive.ethz.ch
+ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
+estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
+repod.dbuw,1,University of Warsaw Research Data Repository,2023,https://danebadawcze.uw.edu.pl/
+inist.ird,1,IRD,2016,
+inist.omp,1,Observatoire Midi-Pyrénées,2011,
+umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
+edi.edi,1,Environmental Data Initiative,2017,https://portal.edirepository.org/nis/home.jsp
+bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
+ardcx.nci,1,National Computational Infrastructure,2020,
+ihumi.pub,1,IHU Méditerranée Infection,2020,
+inist.inrap,1,Institut national de recherches archéologiques préventives,2019,
+tib.mpdl,1,Max Planck Digital Library,2015,
+tudublin.arrow,1,ARROW@TU Dublin,2020,https://arrow.dit.ie/
--- a/1-enrich-with-datacite/concatenate-enrich-dois.py
+++ b/1-enrich-with-datacite/concatenate-enrich-dois.py
+import z_personal_functions as my_functions
+import requests, json, random, pandas as pd
+
+
+print("\n\nRunning concatenate-enrich-dois.py")
+
+# ______0______ load DOIs and remove duplicate
+
+## names of the repositories whose DOI lists are imported
+files_to_load = [ "zenodo", "datacite", "rdg", "bso-via-hal", "nakala" ] 
+
+dois_raw = my_functions.from_files_load_dois(files_to_load)
+print("\n\tDOIs loaded\t\t\t", len(dois_raw))
+
+# going through a set drops the duplicates ; the original order is not kept
+dois = list(set(dois_raw)) ## remove duplicate
+print("\tDOIs to treat\t\t", len(dois))
+
+
+# ______1_____ load DOIs already treater & get md from DataCite for new one
+
+## pour essayer avec un seul DOI
+# temp_doi = dois[random.randint(0, len(dois))]
+# temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
+# print(temp_doi)
+# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
+
+doi_error = [] # retrieve doi error
+temp_rows = [] # put data in dict before df
+
+df_old = pd.read_csv("../dois-uga.csv")
+
+print(f"\n\tnb of DOIs already treated\t{len(df_old)}")
+
+# req dataCite and paste data following instructions
+for doi in dois : #[:300]
+
+	## if doi already treated
+	if doi in df_old["doi"].values : 
+		#print(f"\talready treated\t\t{doi}")
+		continue
+
+	## ___n___ get md from datacite
+	raw_md = my_functions.get_md_from_datacite(doi)
+	## to debug print(f"\t{doi}")
+
+	### if doi not in datacite
+	if raw_md == "error" : 
+		doi_error.append(doi)
+		continue
+
+	## ___n___ from manual instructions retrieve appropriate data
+	selected_md = my_functions.parse_datacite_md(raw_md) ## placer les resultats dans un dictionnaire
+	temp_rows.append(selected_md) ## ajouter ce dictionnaire à une liste
+	print(f"\tadded\t\t{doi}")
+
+## if new datasets has been founded
+if temp_rows :
+	df_fresh = pd.DataFrame(temp_rows)
+	dois_added = list(df_old["doi"])
+	to_del = []
+	for i in range(0, len(df_fresh)):
+		result = my_functions.get_origin_version(df_fresh.loc[i, "doi"])
+		if result[0] not in dois_added: 
+			dois_added.append(result[0])
+			df_fresh.loc[i, "doi"] = result[0]
+			if str(result[1]) != "[]": df_fresh.loc[i, "traveled_dois"] = str(result[1])
+			else: df_fresh.loc[i, "traveled_dois"] = ""	
+			if str(result[2]) != "[]": df_fresh.loc[i, "all_relations"] = str(result[2])
+			else: df_fresh.loc[i, "all_relations"] = ""
+		else:
+			to_del.append(i)
+			
+	df_fresh.drop(to_del, inplace=True)
+	print("Nombre de dois supprimés : " + str(len(to_del)))
+	
+	print("Nb dois a garder : " + str(len(dois_added)))
+
+	df_concat = pd.concat([df_old, df_fresh], ignore_index=True)
+
+	## remove not wanted datacite type & clients
+	type_to_explude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
+	clients_to_exclude = ["rg.rg", "inist.epure"]
+
+	df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(type_to_explude) & ~df_concat["client"].isin(clients_to_exclude) ].copy()
+	
+	## output main CSV
+	df_out.to_csv("../dois-uga.csv", index = False)
+	print(f"\n\nnb of doi exported \t{len(df_out)}")
+
+
+	# write the number of dois found in a file to display on the website
+	with open("nb-dois.txt", 'w') as outf :
+		outf.write(str(len(df_out)))
+
+
+	## output last 500 DOIs to make it easier to open in web tools
+	df_last_dois = df_out.sort_values(by = "created", ascending = False, inplace = False)[:500]
+	df_last_dois["created"] = df_last_dois["created"].str[:10]
+	df_last_dois[["doi", "client", "resourceTypeGeneral", "created", "publisher", "rights", "sizes"]].to_csv("../dois-uga--last-500.csv", index = False)
+
+
+	## for the website : output another csv with datacite client and number of datasets
+	df_client_raw = df_out["client"].value_counts().to_frame()
+
+	## get informations about each client
+	client_names = []
+	client_years = []
+	client_urls = []
+	for i in range(0, len(df_client_raw)):
+		client = df_client_raw.iloc[i].name
+		req = requests.get('https://api.datacite.org/clients?query=id:%22'+str(client)+'%22')
+		client_names.append(req.json()["data"][0]["attributes"]["name"])
+		client_years.append(req.json()["data"][0]["attributes"]["year"])
+		client_urls.append(req.json()["data"][0]["attributes"]["url"])
+	
+	## add informations to the output csv
+	df_client_raw["name"] = client_names
+	df_client_raw["year"] = client_years
+	df_client_raw["url"] = client_urls
+	df_client_raw.to_csv("all_datacite_clients_for_uga.csv")
+
+
--- a/1-enrich-with-datacite/datacite-parser-instructions.json
+++ b/1-enrich-with-datacite/datacite-parser-instructions.json
+{
+	"title" : "explique les chemins et les champs des données de datacite à récupérer",
+	"version" : "2023-11-20",
+	"comentaires" : {
+		"0" : "non prise en compte de l'attribut dates",
+		"1" : "non prise en compte de l'attribut relatedIdentifiers"
+	},
+	"path-and-fields" : {
+
+		"attributes" : {
+			"titles" : {
+				"type" : "list of dict",
+				"past_first_occ" : "title"
+			},
+			"publisher" : "string",
+			"publicationYear" : "int",
+			"subjects" : {
+				"past_values_w_this_key" : "subject",
+				"flatten_all_in_this_key" : "subject_raw"
+			},
+			"language" : "string",
+			"types" : {
+				"type" : "dict",
+				"past_values_w_this_key" : "resourceTypeGeneral"
+			},
+			"sizes" : "string",
+			"formats" : "string",
+			"rightsList" : {
+				"type" : "list of dict",
+				"past_values_w_this_key" :"rights"
+			},
+			"descriptions" : {
+				"type" : "list of dict",
+				"past_first_occ" : "description"
+			},
+			"geoLocations" : {
+				"flatten_all_in_this_key" : "geoLocations_raw"
+			},
+			"FundingReferences" : {
+				"flatten_all_in_this_key" : "FundingReferences_raw"
+			},
+			"source" : "string",
+			"isActive" : "string",
+			"state" : "string",
+			"viewCount" : "int",
+			"downloadCount" : "int",
+			"referenceCount" : "int",
+			"citationCount" : "int",
+			"versionCount" : "int",
+			"created" : "string",
+			"registered" : "string"
+		},
+		"relationships" : {
+			"client" : {
+				"go_to_sub_key" : "data",
+				"get_key" : "id"
+
+			},
+			"provider" : {
+				"go_to_sub_key" : "data",
+				"get_key" : "id"
+			}
+
+		}
+	}
+}
--- a/1-enrich-with-datacite/nb-dois.txt
+++ b/1-enrich-with-datacite/nb-dois.txt
+2712
\ No newline at end of file
--- a/1-enrich-with-datacite/z_personal_functions.py
+++ b/1-enrich-with-datacite/z_personal_functions.py
+import requests, json
+
+# Fonction pour éviter la redondance des données associées à des DOIs différents mais pointant vers les mêmes fichiers :
+# Dans Zenodo par exemple, il y a un DOI associé à chaque version d'un dépôt et il faut remonter au DOI "chapeau"
+# Si le DOI "chapeau" obtenu ou un "is_identical_to" fait référence à un DOI déjà existant dans le csv, il doit être ignoré.
+def get_origin_version(doi, history=[], first=True):
+    if first: history=[] # ligne ajoutée pour éviter certains soucis de cache où history n'est pas vide au premier appel de la fonction
+    req = requests.get( f"https://api.datacite.org/dois/{doi}" )
+    res = req.json()
+    final = []
+    result = (doi, history, final) # doi est le DOI qui sera ajouté au csv, history retrace les dois et les relations ayant permis les recherches et final enregistre les relations du doi final ajouté au csv
+    try:
+        related = res["data"]["attributes"]["relatedIdentifiers"] # test si des relations existent pour le doi courant
+    except:
+        pass # si pas de relation, on renvoie le doi courant
+    else:
+        ignore = False # ignore correspond à un doi ayant une version "chapeau" qui doit être trouvée. Le doi courant doit donc être ignoré
+        duplicate = False # duplicate correspond à un doi étant identique à un autre
+        for i in related:
+            final.append(i.get("relationType"))
+            if i.get("relationType") == "IsVersionOf" and i.get("relatedIdentifierType") == "DOI": 
+                ignore = True
+                elem_to_save_i = i.get("relatedIdentifier")
+                history.append([i.get("relationType"), i.get("relatedIdentifier")])
+            if i.get("relationType") == "IsIdenticalTo" and i.get("relatedIdentifierType") == "DOI":
+                duplicate = True
+                elem_to_save_d = i.get("relatedIdentifier") # pas de symétrie pour les is_identical_to, donc il suffit de prendre l'autre (pas le doi courant) pour éviter les doublons
+                history.append([i.get("relationType"), i.get("relatedIdentifier")])
+        if duplicate and not(ignore):
+            result = (elem_to_save_d, history, final) # si identique mais pas de version chapeau on peut s'arrêter
+        if ignore: result = get_origin_version(elem_to_save_i, history, False) # si version chapeau, on avance sans regarder les identiques
+    return result
+
+def get_md_from_datacite( doi ) : 
+    """
+	retrieve data research metadata from datacite
+    """
+
+    req = requests.get( f"https://api.datacite.org/dois/{doi}" )
+
+    try : 
+    	res = req.json()
+    except : 
+    	return "error"
+
+    if "errors" in res : 
+    	return "error"
+
+    return res   
+
+
+def parse_value_following_instruction(key, instruction, datacite_content) :
+    """
+    Extract the values to keep from one DataCite field, following hand-written instructions.
+
+    key : name of the DataCite field being processed
+    instruction : how to read the field; either the string "string" or "int"
+        (the value is kept unchanged) or a dict holding one or more of the entries below
+    datacite_content : the value DataCite returned for this field
+
+    Entries understood when instruction is a dict:
+        "past_values_w_this_key" : name K. If K is found in datacite_content, its value is
+            stored under K; otherwise datacite_content is iterated as a list of dicts and
+            the non-empty K values are joined with "," and stored under K.
+        "flatten_all_in_this_key" : only read together with "past_values_w_this_key";
+            stores str(datacite_content) under the given name.
+        "past_first_occ" : name K; stores datacite_content[0][K] under K.
+        "go_to_sub_key" with "get_key" : stores datacite_content[parent][child] under key.
+
+    Returns a dict mapping output names to extracted values. An instruction that is
+    neither "string", "int" nor a dict yields {key: "to_do"}.
+    """
+
+    ## simple values are copied unchanged
+    if instruction == "string" or instruction == "int" :
+        return {key: datacite_content}
+
+    ## instruction formats that are not supported yet
+    if not isinstance(instruction, dict) :
+        return {key : "to_do"}
+
+    buffer = {}  ## collects every extracted value
+
+    if "past_values_w_this_key" in instruction :
+        temp_key_to_get = instruction["past_values_w_this_key"]
+
+        ## the content is directly a dict holding the wanted key (e.g. the data type)
+        if temp_key_to_get in datacite_content :
+            buffer[temp_key_to_get] = datacite_content[temp_key_to_get]
+
+        ## the content is a list: keep the non-empty value of each item that has the key
+        else :
+            ## str() so that non-string values (numbers) do not break the join
+            all_vals = [
+                str(item[temp_key_to_get])
+                for item in datacite_content
+                if item.get(temp_key_to_get)
+            ]
+            buffer[temp_key_to_get] = ",".join(all_vals)
+
+        ## also keep the whole raw content as a string when asked to
+        if "flatten_all_in_this_key" in instruction :
+            buffer[instruction["flatten_all_in_this_key"]] = str(datacite_content)
+
+    ## keep only the value found in the first item of the list
+    if "past_first_occ" in instruction :
+        temp_key_to_get = instruction["past_first_occ"]
+        buffer[temp_key_to_get] = datacite_content[0][temp_key_to_get]
+
+    ## go down one level: parent key, then child key
+    if "go_to_sub_key" in instruction :
+        temp_parent_key_to_get = instruction["go_to_sub_key"]
+        temp_child_key_to_get = instruction["get_key"]
+        buffer[key] = datacite_content[temp_parent_key_to_get][temp_child_key_to_get]
+
+    return buffer
+
+
+
+def parse_datacite_md(raw_datacite_mds):
+    """
+    Build a flat metadata dict for one DOI from a raw DataCite response.
+
+    raw_datacite_mds : decoded DataCite response; it must provide
+        ["data"]["id"], ["data"]["attributes"] and ["data"]["relationships"]
+
+    Steps:
+        load the parsing instructions from "datacite-parser-instructions.json"
+            (path relative to the current working directory)
+        iterate on the DataCite attributes, then on the relationships
+            if a field is listed in the instructions and is not empty,
+            extract it with parse_value_following_instruction()
+
+    Returns a dict starting with the "doi" key, followed by the extracted values.
+    """
+    doi_md = {
+        "doi" : raw_datacite_mds["data"]["id"]
+    }
+
+    ## ____0____ load the instructions from the json file
+    with open("datacite-parser-instructions.json", encoding="utf-8") as parser_file :
+        datacite_parser = json.load(parser_file)
+
+    ## ____1____ attributes and relationships are processed the same way
+    for section in ("attributes", "relationships") :
+        ## fields to retrieve for this section, with their instruction
+        fields_to_get = datacite_parser["path-and-fields"][section]
+
+        for field_key, field_value in raw_datacite_mds["data"][section].items() :
+
+            ## skip empty values, except numbers (0 is a meaningful value)
+            if not isinstance(field_value, int) and not field_value :
+                continue
+
+            ## only the fields listed in the instructions are kept
+            if field_key in fields_to_get :
+                value_to_add = parse_value_following_instruction(
+                    field_key,
+                    fields_to_get[field_key],
+                    field_value
+                )
+                doi_md.update(value_to_add)
+
+    return doi_md
+
+
+def from_files_load_dois(files_name) :
+    """
+    Load the DOIs listed in text files located in "../0-collect-data/".
+
+    files_name : iterable of short names; each DOI list is read from
+        "../0-collect-data/<short name>-dois.txt", one DOI per line
+        (path relative to the current working directory)
+
+    Prints the number of DOIs loaded from each file.
+    Returns a single list with the lowercased DOIs of all files, in reading order.
+    """
+    folder_path = "../0-collect-data/"
+    all_dois = []
+
+    for file_shortname in files_name :
+        file_fullname = file_shortname + "-dois.txt"
+
+        with open(folder_path + file_fullname, encoding="utf-8") as f:
+            ## strip() instead of dropping the last character: the last line may
+            ## have no trailing newline, and blank lines must not become empty DOIs
+            ## lower DOI string also
+            temp_loaded_dois = [line.strip().lower() for line in f if line.strip()]
+
+        print(f"\t{file_shortname:<10s}\t\t{len(temp_loaded_dois)}")
+        all_dois.extend(temp_loaded_dois)
+
+    return all_dois
No results found