## hist-evol-datasets-per-repo.py (2.67 KiB)
## committed by Maxence Larrieu
import pandas as pd, matplotlib, matplotlib.pyplot  as plt
import z_my_functions as my_fct
from matplotlib import colormaps

## load the dataset table through the project helper module
df = my_fct.load_and_treat_csv()

## list the available columns
print(df.columns)
## print this to see raw data & adapt graph
# print(df.client.value_counts())

## add a "year-month" column (yyyy-mm): the first 7 characters of "registered"
df["year-month"] = df["registered"].str.slice(stop = 7)

# print(df["year-month"][:1])

## produce a col with datacite clients limited to top 10 in quantity
def reduce_client(client):
	"""
	Reduce a DataCite client id to a short repository label.

	Any id starting with "figshare" gives "Figshare"; a handful of exact ids
	are renamed through the table below; every other id gives "other".

	Mapping written on 2024-01-12: the matching has to be re-checked regularly.
	"""
	## prefix rule first: none of the exact ids below starts with "figshare"
	if client.startswith("figshare"):
		return "Figshare"

	## exact client id -> label shown in the graph
	exact_labels = {
		"cern.zenodo": "Zenodo",
		"inist.osug": "OSUG",
		"inist.persyval": "OSUG",
		"inist.sshade": "OSUG",
		"dryad.dryad": "Dryad",
		"jbru.aau": "AAU",
		"rdg.prod": "RDG",
		"inist.humanum": "Nakala",
	}
	return exact_labels.get(client, "other")

## reduce each DataCite client id to a short repository label
## Series.map walks the single "client" column, instead of building a full row
## object for every line as df.apply(..., axis = 1) does; each value still goes
## through str() first, so missing values end up in "other"
df["client_reduced"] = df["client"].map(lambda client : reduce_client(str(client)))

## count datasets per month (rows) and per reduced client (columns)
## NOTE(review): only the months present in the data get a row, a month without
## any registration is absent from the table — confirm this is fine for the graph
df_evol_linear = pd.crosstab(df["year-month"], df["client_reduced"])
df_evol_linear.index.rename("year-month", inplace = True)

## make the count values cumulative ("cumsum") from one month to the next
df_evol = df_evol_linear.cumsum(axis='index')

## ______0______ produce graphs
fig, ax = plt.subplots(figsize = (10, 7), dpi = 100, facecolor = 'w', edgecolor = 'k')

## one color per repository column, taken in order from the "Set3" colormap
### see color palett https://matplotlib.org/stable/users/explain/colors/colormaps.html
colors = [plt.cm.Set3(position) for position, _ in enumerate(df_evol.columns)]

## one list of cumulative counts per repository, stacked from a zero baseline
cumulative_counts = [counts.tolist() for _, counts in df_evol.items()]
ax.stackplot(
	df_evol.index,
	cumulative_counts,
	labels = df_evol.columns,
	colors = colors,
	baseline = "zero",
)

## legend placed inside the plot area, entries listed in reverse order
ax.legend(loc = "center", reverse = True, bbox_to_anchor = (0.49, 0.65), fontsize = 11)


# ______0______ configure the rendering
for side in ("top", "right") :
	ax.spines[side].set_visible(False)
ax.set_ylabel("Total of datasets", labelpad = 10)
#ax.set_xlabel("Date of DOI registration", labelpad = 10)

ax.yaxis.grid(ls = ":", alpha = 0.5)

## x labels only for January
## keep the row position of every "yyyy-01" month together with its year
january_ticks = [
	(position, month[:-3])
	for position, month in enumerate(df_evol.index)
	if month.endswith("-01")
]
x_idx_toshow = [position for position, _ in january_ticks]
x_label_toshow = [year for _, year in january_ticks]

ax.set_xticks(x_idx_toshow)
ax.set_xticklabels(x_label_toshow, rotation = 70, fontsize = 10)


## main title on the axes, dataset count as a smaller figure-level subtitle
ax.set_title(
	"Evolution of the quantity of UGA open datasets\n and distribution per repository",
	fontsize = 18, x = 0.5, y = 1.03, alpha = 0.8)
fig.suptitle(f"n = {len(df)}", fontsize = 12, x = 0.5, y = 0.87, alpha = 0.6)

## committed by Maxence Larrieu
## uncomment to open the figure in an interactive window
#plt.show()
## write the figure as a PNG file (relative path, so in the current working directory)
fig.savefig("hist-evol-datasets-per-repo.png")