# Forked from
# Maxence Larrieu / UGA Open Research Data Monitor backend
# 188 commits behind the upstream repository.
# -
# Maxence Larrieu authored
# hist-evol-datasets-per-repo.py 2.67 KiB
import pandas as pd, matplotlib, matplotlib.pyplot as plt
import z_my_functions as my_fct
from matplotlib import colormaps
## load the table through the project helper and list the available columns
df = my_fct.load_and_treat_csv()
print(df.columns)

## debugging aid: uncomment to look at the raw client counts & adapt the graph
# print(df.client.value_counts())

## add a "yyyy-mm" column: the first 7 characters of the "registered" string
df["year-month"] = df["registered"].str.slice(stop=7)
# print(df["year-month"][:1])
## helper used to produce a column where DataCite clients are reduced to the main repositories (everything else becomes "other")
# Exact DataCite client id -> short repository label shown in the graph.
_CLIENT_LABELS = {
    "cern.zenodo": "Zenodo",
    "inist.osug": "OSUG",
    "inist.persyval": "OSUG",
    "inist.sshade": "OSUG",
    "dryad.dryad": "Dryad",
    "jbru.aau": "AAU",
    "rdg.prod": "RDG",
    "inist.humanum": "Nakala",
}


def reduce_client(client) -> str:
    """
    Reduce a DataCite client id to a short repository label.

    Mapping written on 2024-01-12; the matching has to be checked regularly.

    Args:
        client: DataCite client id, e.g. "cern.zenodo". Any id starting with
            "figshare" is labelled "Figshare".

    Returns:
        The repository label, or "other" when the id is not in the mapping
        or when ``client`` is not a string (None, NaN, ...).
    """
    # Missing values reach pandas callbacks as None / float NaN; treat them
    # like any unknown client instead of failing on ``.startswith``.
    if not isinstance(client, str):
        return "other"
    # Figshare is the only prefix-based rule; every other rule is an exact id.
    if client.startswith("figshare"):
        return "Figshare"
    return _CLIENT_LABELS.get(client, "other")
## label every row with its reduced client name.
## Series.map is used rather than a row-wise DataFrame.apply(axis=1), which
## would build a full Series per row just to read the "client" field.
df["client_reduced"] = df["client"].map(lambda client: reduce_client(str(client)))

## count the datasets per month (rows) and per reduced client (columns)
## NOTE(review): crosstab only yields a row for months present in the data,
## so a month without any registration has no row -- confirm this is acceptable
df_evol_linear = pd.crosstab(df["year-month"], df["client_reduced"])
df_evol_linear.index.rename("year-month", inplace=True)

## make count value cumulative ("cumsum") down the rows
df_evol = df_evol_linear.cumsum(axis="index")
## ______0______ produce graphs
fig, ax = plt.subplots(figsize=(10, 7), dpi=100, facecolor="w", edgecolor="k")

## one colour per repository column, taken from the "Set3" colormap
### see color palettes https://matplotlib.org/stable/users/explain/colors/colormaps.html
colors = [plt.cm.Set3(position) for position, _ in enumerate(df_evol.columns)]

## one stacked layer per repository column
layers = [df_evol[repository].tolist() for repository in df_evol.columns]
ax.stackplot(
    df_evol.index,
    layers,
    labels=df_evol.columns,
    colors=colors,
    baseline="zero",
)
ax.legend(loc="center", reverse=True, bbox_to_anchor=(0.49, 0.65), fontsize=11)

# ______0______ configure the rendering
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
ax.set_ylabel("Total of datasets", labelpad=10)
#ax.set_xlabel("Date of DOI registration", labelpad = 10)
ax.yaxis.grid(ls=":", alpha=0.5)

## x label only for January:
## collect the row position of every "yyyy-01" entry together with its year
january_entries = [
    (position, month[:-3])
    for position, month in enumerate(df_evol.index)
    if month.endswith("-01")
]
x_idx_toshow = [position for position, _ in january_entries]
x_label_toshow = [year for _, year in january_entries]
ax.set_xticks(x_idx_toshow)
ax.set_xticklabels(x_label_toshow, rotation=70, fontsize=10)

ax.set_title(
    "Evolution of the quantity of UGA open datasets\n and distribution per repository",
    fontsize=18,
    x=0.5,
    y=1.03,
    alpha=0.8,
)
## total number of rows of df, shown as a sub-title
fig.suptitle(f"n = {len(df)}", fontsize=12, x=0.5, y=0.87, alpha=0.6)

#plt.show()
fig.savefig("hist-evol-datasets-per-repo.png")