Newer
Older
# récupérer les DOIs de l'UGA depuis Datacite
## Doc
* datacite API : https://support.datacite.org/docs/api
* Recherche d'un DOI : https://support.datacite.org/docs/api-sampling
* Pagination : https://support.datacite.org/docs/pagination
* nb : "DataCite added support for affiliation identifiers in Metadata Schema 4.3, released August 2019"
https://support.datacite.org/docs/can-i-see-more-detailed-affiliation-information-in-the-rest-api
* mémo : pour descendre au niveau des auteurs, le filtre `?person-id=orcid-nb`
* note 2023-02
retrait de l'AAU (jbru.aau) car tout est du PDF de congrès
ajout de client.uid:inist.sshade et client.uid:inist.resif
"""
lance les requêtes dans l'API
"""
query_root = "https://api.datacite.org/dois?query="
# les type de datasets à exclure (à voir si on retire text)
query_filter_type = " AND resource_type_id:(-book -bookChapter -conferencePaper -conferenceProceeding -dissertation -event -journal -journalArticle -peerReview -preprint -report -service)"
query_page = "&page[size]=100"
req = requests.get(query_root + query_from_list + query_filter_type + query_page)
Elias Chetouane
committed
results = [req.json()]
if view_results:
# print total dataset per query
print(f"\n\t{req.url}")
print(f"\t{results[0]['meta']['total']}")
Elias Chetouane
committed
# obtenir les résultats de chaque page dans la liste results
nb_pages = results[0]["meta"]["totalPages"]
page = 1
while(page < nb_pages):
url = (results[page-1]["links"]["next"])
req = requests.get(url)
results.append(req.json())
page += 1
return results
Elias Chetouane
committed
def get_dois(results):
    """
    Collect dataset DOIs into a list and flag records that have no DOI.

    Parameters
    ----------
    results : list of dict
        JSON pages returned by the DataCite API (as produced by
        ``get_results``); each page holds its records under ``"data"``.

    Returns
    -------
    list of str
        The identifiers of all records whose type is ``"dois"``.

    Side effect: identifiers that are NOT DOIs are appended to the
    module-level list ``main_no_dois`` for later reporting.
    """
    temp_dois = []  # fix: was never initialized, causing a NameError on first append
    # walk the records of every result page
    for res in results:
        for item in res["data"]:
            item_id = item["id"]  # fix: `item_id` was never bound in the original
            # DataCite marks DOI-identified records with type "dois"
            if item.get("type") == "dois":
                temp_dois.append(item_id)
            else:
                # non-DOI identifier: record it in the global report list
                main_no_dois.append(item_id)
    return temp_dois
print("\n\nRunning datacite.py")
import requests, pandas as pd
main_dois = []
main_no_dois = []
## __________0__________ query with all RORs from UGA on authors and contributors
Elias Chetouane
committed
### load table containing finded ROR
df_raw = pd.read_csv("uga-find-ror-childs/UGA-ror-childs--2023-12-29--manual.csv")
#print("columns name: ", [colname for colname in df_raw.columns])
### select accurate ROR
df_hal = df_raw[ df_raw["docid"].notna() ] ## RORs finded via HAL
df_ror = df_raw[ df_raw["Unnamed: 6"] == "include"] ## RORs selected manually /!\ col name
rors = df_hal.ror.tolist() + df_ror.ror.tolist()
rors.append("https://ror.org/02rx3b187") ## add the ROR from UGA !
print(f"\t__process by ROR\n\tnb of ROR loaded\t{len(rors)}")
# debug : try only with UGA ROR
# rors = ["https://ror.org/02rx3b187"]
for auth_type in ["creators", "contributors"] :
query = f"{auth_type}.affiliation.affiliationIdentifier:\"{ror}\""
temp_doi_list = get_dois(get_results(query, False))
[main_dois.append(elem) for elem in temp_doi_list]
print(f"\tnb DOIs finded \t{len(main_dois)}")
## __________1__________ query by datacite client and UGA as pupublisher
print(f"\n\t__process by datacite clients")
query_client_publisher = [
"client_id:inist.osug", # OSUG https://doi.osug.fr/
"client.uid:inist.sshade", # services nationaux d'observation portés par l'OSUG
"client.uid:inist.resif", # services nationaux d'observation portés par l'OSUG
"client_id:inist.persyval", # Labex Persyval-lab (PIA)
"publisher:(grenoble AND alpes)" # /!\ apporte du text
]
temp_doi_list = get_dois(get_results(query, True))
[main_dois.append(elem) for elem in temp_doi_list]
Elias Chetouane
committed
## __________n__________ if Datasets with other things that a DOI have been finded
if main_no_dois :
print("datasets with an other identifier than DOI has been finded")
[print(f"\t\t{elem}") for elem in main_no_dois]
## __________n__________ remove DOIs who are newer version via Figshare repository
## example 10.6084/m9.figshare.23737431.v2
## peut être refait lors de la récupération en véridiant que relatedIdentifiers/relationType:"IsIdenticalTo" ne soit présent
doi_to_remove = []
for doi in main_dois :
if "figshare" in doi :
# synthaxe : remove the number of the version
doi_shorten = doi[: len(doi) - 1]
if doi_shorten.endswith(".v") :
doi_to_remove.append(doi)
[main_dois.remove(elem) for elem in doi_to_remove]
## __________n__________ remove duplicates
unique_dois = list(set(main_dois))
print(f"\tNb of unique DOI\t{len(unique_dois)}")
with open("datacite-dois.txt", 'w') as f :
Elias Chetouane
committed
[f.write(f"{line}\n") for line in unique_dois]