# datacite.py
# Harvest the UGA DOIs from DataCite
## 2023-12-01, Elias Chetouane, Maxence Larrieu

## Docs
# * DataCite API: https://support.datacite.org/docs/api
# * Looking up a DOI: https://support.datacite.org/docs/api-sampling
# * Pagination: https://support.datacite.org/docs/pagination
# * NB: "DataCite added support for affiliation identifiers in Metadata Schema 4.3, released August 2019"
#   https://support.datacite.org/docs/can-i-see-more-detailed-affiliation-information-in-the-rest-api
# * Memo: to drill down to the author level, use the filter `?person-id=orcid-nb`

# * Note 2023-02
#   removed AAU (jbru.aau) because everything there is conference PDFs
#   added client.uid:inist.sshade and client.uid:inist.resif
def get_results(query_from_list, view_results):
    """
    Run one query against the DataCite API and collect every result page.

    Parameters
    ----------
    query_from_list : str
        The query part appended to the DataCite `/dois?query=` endpoint.
    view_results : bool
        When True, print the request URL and the total number of hits.

    Returns
    -------
    list
        One parsed JSON response (dict) per result page.
    """
    query_root = "https://api.datacite.org/dois?query="

    # dataset types to exclude (to be decided whether to also drop "text")
    # note 2023-12-15: this only removes 15 DOIs
    query_filter_type = " AND resource_type_id:(-book -bookChapter -conferencePaper -conferenceProceeding -dissertation -event -journal -journalArticle -peerReview -preprint -report -service)"

    query_page = "&page[size]=100"

    req = requests.get(query_root + query_from_list + query_filter_type + query_page)
    # fix: store the first page — `results` was previously used without being assigned
    results = [req.json()]

    if view_results:
        # print total dataset count for this query
        print(f"\n\t{req.url}")
        print(f"\t{results[0]['meta']['total']}")

    # follow the "next" links to append every remaining page to `results`
    nb_pages = results[0]["meta"]["totalPages"]
    page = 1
    while page < nb_pages:
        url = results[page - 1]["links"]["next"]
        req = requests.get(url)
        results.append(req.json())
        page += 1
    return results
    """
    Ajouter les datasets avec DOI dans une liste et identifier ceux sans DOI
    """
Maxence Larrieu's avatar
Maxence Larrieu committed
    temp_dois = []

    # prendre en compte les résultats de chaque page
    for res in results:
        num_dois = res["data"]
Maxence Larrieu's avatar
Maxence Larrieu committed
            item_id = item["id"]

Maxence Larrieu's avatar
Maxence Larrieu committed
            # si le dataset a un id qui correspond à un DOI
            if item.get("type") == "dois":
                temp_dois.append(item_id)
Maxence Larrieu's avatar
Maxence Larrieu committed
                ## si l'id n'est pas un DOI on l'ajoute ds une liste globale
                main_no_dois.append(item_id)
Maxence Larrieu's avatar
Maxence Larrieu committed
    return temp_dois


print("\n\nRunning datacite.py")

import requests, pandas as pd
main_dois = []
main_no_dois = []

## __________0__________ query with all RORs from UGA on authors and contributors

### load the table containing the RORs that were found
df_raw = pd.read_csv("uga-find-ror-childs/UGA-ror-childs--2023-12-29--manual.csv")
#print("columns name: ", [colname for colname in df_raw.columns])

### select the relevant RORs
df_hal = df_raw[ df_raw["docid"].notna() ]            ## RORs found via HAL
df_ror = df_raw[ df_raw["Unnamed: 6"] == "include"]   ## RORs selected manually /!\ column name
rors = df_hal.ror.tolist() + df_ror.ror.tolist()
rors.append("https://ror.org/02rx3b187")  ## add the ROR of UGA itself!
print(f"\t__process by ROR\n\tnb of ROR loaded\t{len(rors)}")

# debug: try only with the UGA ROR
# rors = ["https://ror.org/02rx3b187"]

for ror in rors:  # to debug add [:1]
    # query both author roles for each ROR
    for auth_type in ["creators", "contributors"]:
        query = f"{auth_type}.affiliation.affiliationIdentifier:\"{ror}\""
        # extend() instead of a side-effect list comprehension
        main_dois.extend(get_dois(get_results(query, False)))

print(f"\tnb DOIs finded \t{len(main_dois)}")


## __________1__________ query by datacite client and UGA as publisher
print(f"\n\t__process by datacite clients")
query_client_publisher = [
    "client_id:inist.osug",             # OSUG https://doi.osug.fr/
    "client.uid:inist.sshade",          # national observation services hosted by OSUG
    "client.uid:inist.resif",           # national observation services hosted by OSUG
    "client_id:inist.persyval",         # Labex Persyval-lab (PIA)
    "publisher:(grenoble AND alpes)"    # /!\ also brings in text resources
]

for query in query_client_publisher:
    # extend() instead of a side-effect list comprehension
    main_dois.extend(get_dois(get_results(query, True)))

print(f"\tnb DOI finded \t{len(main_dois)}")
## __________n__________ report datasets that carry something other than a DOI
if main_no_dois:
    print("datasets with an other identifier than DOI has been finded")
    for elem in main_no_dois:
        print(f"\t\t{elem}")

## __________n__________ remove DOIs that are merely newer versions on the Figshare repository
## example 10.6084/m9.figshare.23737431.v2
## could be redone at harvest time by checking that relatedIdentifiers/relationType:"IsIdenticalTo" is absent
doi_to_remove = []
for doi in main_dois:
    if "figshare" in doi:
        # a versioned DOI ends in ".v" followed by digits;
        # rpartition handles multi-digit versions (".v12"), unlike the old
        # single-character strip which only matched ".v0"–".v9"
        _, sep, version = doi.rpartition(".v")
        if sep and version.isdigit():
            doi_to_remove.append(doi)

for elem in doi_to_remove:
    main_dois.remove(elem)

## __________n__________ remove duplicates
unique_dois = list(set(main_dois))
print(f"\tNb of unique DOI\t{len(unique_dois)}")

# __________z__________ export DOIs in txt files
with open("datacite-dois.txt", 'w') as f:
    f.writelines(f"{line}\n" for line in unique_dois)