Remove unwanted DataCite (DC) clients

38e29814 · Maxence Larrieu · 77cb87c6 · 38e29814 · 38e29814 · 38e29814
Commit 38e29814 authored 1 year ago by Maxence Larrieu
--- a/0-collect-data/zenodo-dois.txt
+++ b/0-collect-data/zenodo-dois.txt
@@ -767,4 +767,4 @@
 10.5281/zenodo.4784408
 10.5281/zenodo.6397629
 10.5281/zenodo.3611936
-10.5281/zenodo.7795898
+10.5281/zenodo.7795898
\ No newline at end of file
--- a/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
+++ b/1-enrich-with-datacite/all_datacite_clients_for_uga.csv
@@ -10,26 +10,24 @@ rdg.prod,43,Recherche Data Gouv France,2022,https://recherche.data.gouv.fr/en
 inist.humanum,26,Huma-Num,2020,https://nakala.fr
 figshare.sage,14,figshare SAGE Publications,2018,
 mcdy.dohrmi,12,dggv-e-publications,2020,https://www.dggv.de/publikationen/dggv-e-publikationen.html
-rg.rg,4,ResearchGate,2016,https://www.researchgate.net/search/data
 iris.iris,3,Incorporated Research Institutions for Seismology,2018,http://www.iris.edu/hq/
 vqpf.dris,3,Direction des ressources et de l'information scientifique,2021,
 tib.gfzbib,3,GFZpublic,2011,https://gfzpublic.gfz-potsdam.de
 tib.repod,2,RepOD,2015,
 ugraz.unipub,2,unipub,2019,http://unipub.uni-graz.at
 bl.nerc,2,NERC Environmental Data Service,2011,https://eds.ukri.org
-inist.epure,2,Éditions et presses universitaires de Reims,2020,
-inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
-ardcx.nci,1,National Computational Infrastructure,2020,
-umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
-bl.mendeley,1,Mendeley Data,2015,https://data.mendeley.com/
-inist.eost,1,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
-crui.ingv,1,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
-bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
 ihumi.pub,1,IHU Méditerranée Infection,2020,
-inist.omp,1,Observatoire Midi-Pyrénées,2011,
-inist.ird,1,IRD,2016,
-tib.gfz,1,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
+ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
 edi.edi,1,Environmental Data Initiative,2017,https://portal.edirepository.org/nis/home.jsp
+tib.gfz,1,GFZ Data Services,2011,https://dataservices.gfz-potsdam.de/portal/
+inist.ird,1,IRD,2016,
+inist.omp,1,Observatoire Midi-Pyrénées,2011,
 tug.openlib,1,TU Graz OPEN Library,2020,https://openlib.tugraz.at/
-ethz.zora,1,"Universität Zürich, ZORA",2013,https://www.zora.uzh.ch/
+inist.opgc,1,Observatoire de Physique du Globe de Clermont-Ferrand,2017,
+crui.ingv,1,Istituto Nazionale di Geofisica e Vulcanologia (INGV),2013,http://data.ingv.it/
+inist.eost,1,Ecole et Observatoire des Sciences de la Terre,2017,https://eost.unistra.fr/en/
+bl.mendeley,1,Mendeley Data,2015,https://data.mendeley.com/
+bl.iita,1,International Institute of Tropical Agriculture datasets,2017,http://data.iita.org/
+umass.uma,1,University of Massachusetts (UMass) Amherst,2018,https://scholarworks.umass.edu/
+ardcx.nci,1,National Computational Infrastructure,2020,
 estdoi.ttu,1,TalTech,2019,https://digikogu.taltech.ee
--- a/1-enrich-with-datacite/concatenate-enrich-dois.py
+++ b/1-enrich-with-datacite/concatenate-enrich-dois.py
@@ -20,7 +20,7 @@ print("\tDOIs to treat\t\t", len(dois))

 ## pour essayer avec un seul DOI
 # temp_doi = dois[random.randint(0, len(dois))]
-# #temp_doi = "10.57745/QYIAWX" - 10.25656/01:8509
+# temp_doi = "10.57745/QYIAWX"  # alternative test DOI: "10.25656/01:8509"
 # print(temp_doi)
 # raw_metadatas = my_functions.get_md_from_datacite(temp_doi)

@@ -28,6 +28,7 @@ doi_error = [] # retrieve doi error
 temp_rows = [] # put data in dict before df

 df_old = pd.read_csv("../dois-uga.csv")
+
 print(f"\n\tnb of DOIs already treated\t{len(df_old)}")

 # req dataCite and paste data following instructions
@@ -57,9 +58,11 @@ if temp_rows :
 	df_fresh = pd.DataFrame(temp_rows)
 	df_concat = pd.concat([df_old, df_fresh], ignore_index=True)

-	## remove not wanted datacite type
+	## remove unwanted DataCite resource types & clients
 	type_to_explude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
-	df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(type_to_explude) ].copy()
+	clients_to_exclude = ["rg.rg", "inist.epure"]
+
+	df_out = df_concat[ ~df_concat["resourceTypeGeneral"].isin(type_to_explude) & ~df_concat["client"].isin(clients_to_exclude) ].copy()
 	
 	## output main CSV
 	df_out.to_csv("../dois-uga.csv", index = False)
@@ -69,7 +72,7 @@ if temp_rows :
 	with open("nb-dois.txt", 'w') as outf :
 		outf.write(str(len(df_out)))

-	## output another csv with datacite client and number of datasets
+	## for the website: output another CSV listing each DataCite client and its number of datasets
 	df_client_raw = df_out["client"].value_counts().to_frame()

 	## get informations about each client

--- a/1-enrich-with-datacite/nb-dois.txt
+++ b/1-enrich-with-datacite/nb-dois.txt
-2075
\ No newline at end of file
+2069
\ No newline at end of file
--- a/dois-uga.csv
+++ b/dois-uga.csv