add dataset type graph

046c0432 · Maxence Larrieu · 8c3677f3 · 046c0432 · 046c0432 · 046c0432
Commit 046c0432 authored 1 year ago by Maxence Larrieu
--- a/0-collect-data/datacite-dois.txt
+++ b/0-collect-data/datacite-dois.txt
--- a/0-collect-data/nakala-dois.txt
+++ b/0-collect-data/nakala-dois.txt
-10.34847/nkl.5bcck3cz
-10.34847/nkl.ca709965
 10.34847/nkl.a0fe865m
 10.34847/nkl.76abr599
 10.34847/nkl.6caam3dp
+10.34847/nkl.5bcck3cz
+10.34847/nkl.ca709965
 10.34847/nkl.ca8dmbdh
 10.34847/nkl.a5ae8y33
 10.34847/nkl.748eqz51
@@ -22,5 +22,11 @@
 10.34847/nkl.3dbc2mtb
 10.34847/nkl.bc2b1071
 10.34847/nkl.81dcdekj
-10.34847/nkl.ef903o6v
+10.34847/nkl.b1cb3arm
-10.34847/nkl.ae94a74k
+10.34847/nkl.c9e92or4
+10.34847/nkl.bf5f263z
+10.34847/nkl.9f85iol5
+10.34847/nkl.345bf9i7
+10.34847/nkl.9cd8hi4k
+10.34847/nkl.e1e41vdi
+10.34847/nkl.deb655as
--- a/0-collect-data/nakala-uga-users.txt
+++ b/0-collect-data/nakala-uga-users.txt
@@ -15,4 +15,6 @@ egreslou
 troulet
 mbeligne
 acarbonnelle
 annegf
\ No newline at end of file
+tleduc
+abey
\ No newline at end of file
--- a/0-collect-data/nakala.py
+++ b/0-collect-data/nakala.py
@@ -104,8 +104,9 @@ for user in nakala_uga_users :
 with open("nakala-dois.txt", 'w') as fh :
    [fh.write(f"{line}\n") for line in all_dois]
-## print les autres utilisateurs trouvés7
+## print the other users that were found
-print("\n\n nakala new user finded ")
+if other_user_finded : 
-for elem in other_user_finded : 
+    print("\n\n nakala new user finded ")
-    print("\t\telem")
+    for elem in other_user_finded : 
+        print(f"\t\t{elem}")
--- a/0-collect-data/rdg-dois.txt
+++ b/0-collect-data/rdg-dois.txt
-10.57745/QOA1QO
+10.57745/52HT2L
-10.57745/GZKUZS
+10.57745/7RFNNP
-10.57745/J2A44Q
+10.57745/UOGRPY
-10.15454/M7OK9E
+10.57745/GRHRZJ
-10.57745/QOA1QO
+10.57745/7HF7KG
-10.57745/GZKUZS
-10.57745/NOHRHJ
-10.57745/JOZ1NA
-10.57745/BYWEA3
-10.57745/J2A44Q
-10.57745/QOA1QO
-10.57745/B6PSX0
-10.57745/BYWEA3
-10.57745/TVAHUQ
-10.57745/BYWEA3
-10.57745/QCVYG3
-10.57745/NGC4J0
-10.57745/HZDPTT
-10.57745/69UNAM
-10.57745/ENJADK
-10.57745/GZKUZS
 10.57745/ENJADK
-10.57745/LUTMNE
+10.57745/69UNAM
 10.57745/NZFWP9
-10.57745/LXTWNG
+10.57745/R1NIKK
+10.57745/QCVYG3
+10.57745/KTFZQD
+10.57745/B6PSX0
+10.15454/M7OK9E
+10.57745/CM2WOI
 10.57745/GZKUZS
+10.57745/NGC4J0
+10.57745/IZHDPC
+10.57745/LPJ2S2
 10.57745/W9N5Z9
-10.57745/RUQLJL
+10.57745/TVAHUQ
-10.15454/8UIA76
+10.57745/BYWEA3
-10.57745/OVCWQN
+10.15454/O93984
+10.57745/QOA1QO
+10.57745/YWBDQQ
+10.57745/JOZ1NA
+10.57745/XHQ7TL
+10.57745/ID1LS6
+10.57745/3VMB3Y
 10.57745/MXEMI4
+10.57745/NOHRHJ
+10.57745/OVCWQN
 10.57745/5O6QIH
-10.57745/KTFZQD
+10.57745/RUQLJL
-10.57745/R1NIKK
+10.57745/OT1IFB
-10.57745/IZHDPC
-10.57745/TOR3SF
 10.57745/Z3BG2U
-10.57745/7HF7KG
 10.57745/3D4DFW
-10.57745/OT1IFB
+10.57745/LXTWNG
-10.57745/XHQ7TL
+10.57745/LUTMNE
-10.57745/ID1LS6
+10.15454/8UIA76
-10.57745/52HT2L
+10.57745/TOR3SF
-10.57745/YWBDQQ
+10.57745/J2A44Q
-10.57745/LPJ2S2
+10.57745/HZDPTT
-10.57745/CM2WOI
-10.57745/3VMB3Y
-10.15454/O93984
-10.57745/GRHRZJ
-10.57745/7RFNNP
-10.57745/UOGRPY
--- a/0-collect-data/zenodo-dois.txt
+++ b/0-collect-data/zenodo-dois.txt
--- a/2-produce-graph/hist--datasets-by-year.png
+++ b/2-produce-graph/hist--datasets-by-year.png
--- a/2-produce-graph/pie--datacite-client.png
+++ b/2-produce-graph/pie--datacite-client.png
--- a/2-produce-graph/pie--datacite-type.png
+++ b/2-produce-graph/pie--datacite-type.png
--- a/2-produce-graph/pie-data-type.py
+++ b/2-produce-graph/pie-data-type.py
+import pandas as pd, matplotlib, matplotlib.pyplot  as plt
+import z_my_functions as my_fct
+import seaborn as sns
+import random
+df = my_fct.load_and_treat_csv()
+print(df.columns)
+df_type = df["resourceTypeGeneral"].value_counts()
+# print(df_type_raw)
+# ## regroup small values in "other"
+# treshold = 20
+# df_type = df_type_raw[df_type_raw > treshold]
+# df_type["other"] = df_type[df_type <= treshold].sum()
+# define the Seaborn color palette to use
+colors = sns.color_palette('pastel')[0:len(df_type)]
+random.shuffle(colors) ## shuffle so that blue is no longer always the first color
+plt.pie(df_type, colors = colors, autopct=lambda p: '{:.0f}%'.format(round(p)) if p > 1 else '', startangle = 160)
+## the percentage label is shown only on slices larger than 1 %
+plt.legend(df_type.index, loc = (0.7, -0.1) )
+plt.title(f"Type of datasets", fontsize = 20, x = 0.5, y = 1.03, alpha = 0.6)
+plt.suptitle(f"n = {len(df)}", fontsize = 11, x = 0.5, y = 0.9, alpha = 0.6)
+plt.savefig("pie--datacite-type.png")
+# print(len(df))
\ No newline at end of file
--- a/2-produce-graph/pie-datacite-client.py
+++ b/2-produce-graph/pie-datacite-client.py
@@ -17,7 +17,7 @@ clients_name = []
 for item in df_client.index :
 	short_name = item[: item.find(".")]
 	if short_name not in ["inist", "jbru"] : 
-		clients_name.append( short_name)
+		clients_name.append( short_name.capitalize())
 	else : 
 		clients_name.append(item)
@@ -32,6 +32,7 @@ colors = sns.color_palette('pastel')[0:len(df_client)]
 plt.pie(df_client, labels = clients_name, colors = colors, autopct='%.0f%%')
 plt.title(f"Distribution of datasets by DataCite client", fontsize = 20, x = 0.5, y = 1.03, alpha = 0.6)
+plt.suptitle(f"n = {len(df)}", fontsize = 11, x = 0.5, y = 0.90, alpha = 0.6)
 plt.savefig("pie--datacite-client.png")

--- a/2-produce-graph/z_my_functions.py
+++ b/2-produce-graph/z_my_functions.py
@@ -6,7 +6,7 @@ def load_and_treat_csv() :
 	df_raw = pd.read_csv("../dois-uga.csv", index_col=False)
 	## remove datacite type that are not "research data"
-	type_to_explude = ["Book", "ConferencePaper", "JournalArticle", "BookChapter", "Service", "Preprint"]
+	type_to_explude = ["Book", "ConferencePaper", "ConferenceProceeding", "JournalArticle", "BookChapter", "Service", "Preprint"]
 	df = df_raw[ ~df_raw["resourceTypeGeneral"].isin(type_to_explude) ].copy()
 	return df

--- a/dois-uga.csv
+++ b/dois-uga.csv
--- a/run-all-codes.py
+++ b/run-all-codes.py
@@ -19,4 +19,9 @@ file_names = [
 	"rdg.py"
 ]
-execute_python_file(file_names[1])
+for file in file_names : 
+	execute_python_file(file)
+# execute_python_file(file_names[1])