# [web-view residue, not part of the script] Newer
# [web-view residue, not part of the script] Older
import z_personal_functions as my_functions
import requests, json, random, pandas as pd
# ______0______ load DOIs and remove duplicates
## specify the list of repositories to import
files_to_load = ["zenodo", "datacite", "rdg", "bso-via-hal", "nakala"]
dois_raw = my_functions.from_files_load_dois(files_to_load)
print("\n\tDOIs loaded\t\t\t", len(dois_raw))
dois = list(set(dois_raw))  ## remove duplicates
print("\tDOIs to treat\t\t", len(dois))

# ______1_____ load DOIs already treated & get metadata from DataCite for new ones
## to try with a single DOI
# temp_doi = random.choice(dois)
# print(temp_doi)
# raw_metadatas = my_functions.get_md_from_datacite(temp_doi)
doi_error = []  # DOIs for which the metadata lookup reported an error
temp_rows = []  # one dict per newly treated DOI, turned into a DataFrame below
df_old = pd.read_csv("../dois-uga.csv")

# Build the lookup once: testing membership against the raw column values
# would rescan the whole column for every DOI.
already_treated = set(df_old["doi"])

# request DataCite and keep the data selected by the parsing instructions
for doi in dois:  # [:300]
    ## skip DOIs already present in the previous export
    if doi in already_treated:
        # print(f"\talready treated\t\t{doi}")
        continue

    # NOTE(review): the fetch call was missing, leaving raw_md undefined; the
    # helper is taken from the single-DOI snippet above -- confirm it is the
    # intended one.
    raw_md = my_functions.get_md_from_datacite(doi)

    ### the lookup signals a failure with the string "error"
    if raw_md == "error":
        doi_error.append(doi)
        continue

    ## ___n___ from manual instructions retrieve appropriate data
    selected_md = my_functions.parse_datacite_md(raw_md)  ## put the results in a dictionary
    temp_rows.append(selected_md)  ## add this dictionary to the list
    print(f"\tadded\t\t{doi}")

# Report the DOIs that could not be retrieved instead of dropping them silently.
if doi_error:
    print(f"\tDOIs in error\t\t{len(doi_error)}")

# Turn the collected rows into a DataFrame and append them to the previous export.
df_fresh = pd.DataFrame(temp_rows)
df_concat = pd.concat([df_old, df_fresh], ignore_index=True)

## remove unwanted DataCite resource types
types_to_exclude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
df_out = df_concat[~df_concat["resourceTypeGeneral"].isin(types_to_exclude)].copy()

## output main CSV
df_out.to_csv("../dois-uga.csv", index=False)
print(f"\n\nnb of doi exported \t{len(df_out)}")
# [blame-view residue, not part of the script] Elias Chetouane committed
# Store the count of exported DOIs in a small text file that the website displays.
nb_dois_exported = len(df_out)
with open("nb-dois.txt", 'w') as nb_dois_file:
    nb_dois_file.write(f"{nb_dois_exported}")
# [blame-view residue, not part of the script] Elias Chetouane committed
## output another csv with datacite client and number of datasets
df_client_raw = df_out["client"].value_counts().to_frame()


def _get_client_attributes(client_id):
    """Return the DataCite ``attributes`` dict of *client_id*.

    Queries ``https://api.datacite.org/clients`` with ``id:"<client_id>"``.
    Returns an empty dict (after printing a short message) when the request
    fails, the answer is not valid JSON, or no client record is returned, so
    one bad client does not abort the whole export.
    """
    try:
        response = requests.get(
            "https://api.datacite.org/clients",
            params={"query": f'id:"{client_id}"'},  # requests URL-encodes the quotes
            timeout=30,  # never hang forever on an unresponsive API
        )
        response.raise_for_status()
        records = response.json().get("data", [])  # parse the body only once
    except (requests.RequestException, ValueError) as err:
        print(f"\tclient lookup failed\t{client_id}\t{err}")
        return {}
    if not records:
        print(f"\tclient not found\t{client_id}")
        return {}
    return records[0].get("attributes", {})


## get informations about each client (the index holds the client ids)
client_attributes = [_get_client_attributes(client) for client in df_client_raw.index]
client_names = [attributes.get("name") for attributes in client_attributes]
client_years = [attributes.get("year") for attributes in client_attributes]
client_urls = [attributes.get("url") for attributes in client_attributes]

## add informations to the output csv
df_client_raw["name"] = client_names
df_client_raw["year"] = client_years
df_client_raw["url"] = client_urls
df_client_raw.to_csv("all_datacite_clients_for_uga.csv")