import z_personal_functions as my_functions
import requests
import json
import random
import pandas as pd

print("\n\nRunning concatenate-enrich-dois.py")

# ______0______ load DOIs and remove duplicates
## specify the list of repositories to import
files_to_load = [
    "zenodo",
    "datacite",
    "rdg",
    "bso-via-hal",
    "nakala"
]
dois_raw = my_functions.from_files_load_dois(files_to_load)
print("\n\tDOIs loaded\t\t\t", len(dois_raw))

dois = list(set(dois_raw))  ## remove duplicates
print("\tDOIs to treat\t\t", len(dois))

# ______1______ load DOIs already treated & get metadata from DataCite for the new ones
## to test with a single DOI
# temp_doi = dois[random.randint(0, len(dois) - 1)]
# #temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)

doi_error = []   # collect DOIs that could not be retrieved
temp_rows = []   # collect each record as a dict before building the DataFrame

df_old = pd.read_csv("../dois-uga.csv")
print(f"\n\tnb of DOIs already treated\t{len(df_old)}")

# query DataCite and extract the data following the manual instructions
for doi in dois:  # use dois[:300] to limit the run while testing
    ## skip DOIs already treated
    if doi in df_old["doi"].values:
        # print(f"\talready treated\t\t{doi}")
        continue

    ## ___n___ get metadata from DataCite
    raw_md = my_functions.get_md_from_datacite(doi)
    ## to debug
    print(f"\t{doi}")

    ### if the DOI is not in DataCite
    if raw_md == "error":
        doi_error.append(doi)
        continue

    ## ___n___ from the manual instructions retrieve the appropriate data
    selected_md = my_functions.parse_datacite_md(raw_md)
    ## store the results in a dictionary and append it to the list
    temp_rows.append(selected_md)
    print(f"\tadded\t\t{doi}")

## if new datasets have been found
if temp_rows:
    df_fresh = pd.DataFrame(temp_rows)
    df_concat = pd.concat([df_old, df_fresh], ignore_index=True)

    ## remove unwanted DataCite resource types
    types_to_exclude = ["Book", "ConferencePaper", "ConferenceProceeding",
                        "JournalArticle", "BookChapter", "Service", "Preprint"]
    df_out = df_concat[~df_concat["resourceTypeGeneral"].isin(types_to_exclude)].copy()

    ## output the main CSV
    df_out.to_csv("../dois-uga.csv", index=False)
    print(f"\n\nnb of doi exported \t{len(df_out)}")

    # write the number of DOIs found in a file to display on the website
    with open("nb-dois.txt", "w") as outf:
        outf.write(str(len(df_out)))

    ## output another CSV with each DataCite client and its number of datasets
    df_client_raw = df_out["client"].value_counts().to_frame()

    ## get information about each client
    client_names = []
    client_years = []
    client_urls = []

    for i in range(len(df_client_raw)):
        client = df_client_raw.iloc[i].name  # the client id is the index label
        req = requests.get("https://api.datacite.org/clients?query=id:%22" + str(client) + "%22")
        attributes = req.json()["data"][0]["attributes"]
        client_names.append(attributes["name"])
        client_years.append(attributes["year"])
        client_urls.append(attributes["url"])

    ## add the information to the output CSV
    df_client_raw["name"] = client_names
    df_client_raw["year"] = client_years
    df_client_raw["url"] = client_urls
    df_client_raw.to_csv("all_datacite_clients_for_uga.csv")
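

# ---------------------------------------------------------------------------
# Illustrative sketch only: the real helper lives in z_personal_functions and
# is not shown in this file. This is a guess at the contract the loop above
# relies on (return a DOI's metadata, or "error" when the lookup fails),
# assuming the public DataCite REST API endpoint GET /dois/{doi}. The name
# _sketch_get_md_from_datacite is hypothetical and is never called here.
# ---------------------------------------------------------------------------
def _sketch_get_md_from_datacite(doi):
    """Return the DataCite attributes of a DOI, or "error" if it cannot be retrieved."""
    resp = requests.get(f"https://api.datacite.org/dois/{doi}")
    if resp.status_code != 200:
        return "error"
    return resp.json()["data"]["attributes"]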