Commit 03e37b27 authored by Jonathan Schaeffer
parents 736d44de 638ab314
@@ -18,6 +18,14 @@ Configuration is done through environment variables:
* `RESIFDD_START_AT`: specifies a year from which to resume the transfer. All items belonging to an earlier year are ignored.
* `RESIFDD_KEYFILE`: if this variable points to a valid file, it is used to transfer the data corresponding to the keys listed in that file.
* `GITLAB_TOKEN`: if this variable is provided, the script fetches all Git projects in order to back them up. The token must have the `read_api` and `read_repository` privileges on the whole OSUG/RESIF group.
And for the database connection, the following must be set:
* `PGHOST`: the PostgreSQL server (default `localhost`)
* `PGPORT`: the connection port (default 5432)
* `PGUSER`: the user for the connection. It must have SELECT rights on the table `rall`
* `PGDATABASE`: the inventory database
* `PGPASSWORD`: the PostgreSQL password
## Examples
### Running the full dump
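A minimal invocation sketch; the entry-point name `resif-data-dump.sh` is an assumption, and the values are placeholders:

```bash
# Illustrative values only; PGPASSWORD is a placeholder, not a real credential
export RESIFDD_WORKDIR=/var/tmp/resifdd
export RESIFDD_SINCE=2022-06-01        # mandatory starting date
export PGHOST=resif-pgprod.u-ga.fr PGPORT=5432 PGDATABASE=resifInv-Prod
export PGUSER=resifinv_ro PGPASSWORD=XXXXXXXX
./resif-data-dump.sh
```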
......
#!/bin/bash
############# variables ################
# authentication username
username="ccdump"
# token for ccdump
TOKEN="emN7szEu-hchxv-eYS6D"
# url prefix in children.json file
PREFIX="http_url_to_repo"
# number of results per page (max = 100)
per_page=50
# the Git repositories under OSUG/RESIF/ are listed in the children.json file
url_repositories="https://gricad-gitlab.univ-grenoble-alpes.fr/api/v4/groups/192/projects?simple=true&include_subgroups=true&per_page=$per_page"
############ script ##################
# total number of projects
total=$(curl -i -s --header "PRIVATE-TOKEN: $TOKEN" "$url_repositories" | awk '/x-total:/ {printf "%.0f\n", $2}')
# max page calculation
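# Worked example (illustrative numbers): with x-total = 123 and per_page = 50,
# integer division gives max_page = 123 / 50 + 1 = 3 pages to fetch.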
max_page=$(( total / per_page + 1 ))
for page in $(seq $max_page); do
# project list creation
project_list=$(curl -s --header "PRIVATE-TOKEN: $TOKEN" "$url_repositories&page=$page" | grep -o "\"$PREFIX\":[^ ,]\+" | xargs -n 1 | sed -e 's#^http_url_to_repo:https://##')
for url in $project_list; do
# e.g. gricad-gitlab.univ-grenoble-alpes.fr/OSUG/RESIF/ws-availability-k8s.git
pattern="^.*\/OSUG\/RESIF\/(.*\/){0,1}(.+\.git)$"
if [[ "$url" =~ $pattern ]]; then
destdir=$RESIFDD_WORKDIR/projects/${BASH_REMATCH[1]}
project=${BASH_REMATCH[2]}
echo "Cloning $url in $destdir/$project"
mkdir -p "$destdir"
git clone -q --bare "https://$username:$TOKEN@$url" "$destdir/$project"
fi
done
done
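# Sketch of an alternative extraction, assuming jq is available: parse the
# JSON instead of scraping it with grep/sed, then strip the scheme:
#   curl -s --header "PRIVATE-TOKEN: $TOKEN" "$url_repositories&page=$page" \
#     | jq -r '.[].http_url_to_repo' | sed 's#^https://##'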
#!/bin/bash
#
# Author: Jonathan Schaeffer <jonathan.schaeffer@univ-grenoble-alpes.fr>
# This script dumps precious RESIF data and metadata to an IRODS server
#
@@ -171,6 +172,10 @@ pack_and_send() {
[ $# -eq 0 ] && echo "[pack_and_send] Need a path for data to send" && return 1
# Parse path to get year, station and network
dir=$1
# First test if path exists, else return
if [[ ! -d $dir ]]; then
return 1
fi
IFS='/' read -r -a YNS <<< "$dir"
[ ${#YNS[@]} -lt 4 ] && echo "[pack_and_send] Path $dir is not complete (${#YNS[@]} levels)" && return 2
YEAR=${YNS[-3]}
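# The path is expected to end with YEAR/NETWORK/STATION, e.g. an illustrative
# dir=/snapshots/2022/FR/CIEL yields YEAR=2022, NETWORK=FR, STATION=CIEL.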
@@ -188,6 +193,7 @@ pack_and_send() {
RC=0
fi
fi
# Connect to inventory database to check for recent files.
echo "[$KEY] Creating tar on $RESIFDD_WORKDIR/$KEY.tar"
tar_cmd="tar cf $RESIFDD_WORKDIR/$KEY.tar -C ${dir%$YEAR/$NETWORK/$STATION} ${YEAR}/${NETWORK}/${STATION}"
echo "[$KEY] $tar_cmd"
@@ -224,6 +230,11 @@ pack_and_send() {
}
export -f pack_and_send # Necessary for call with GNU parallel
# This function identifies directories to dump using the inventory database
get_modified_directories() {
    : # not implemented yet; the selection query is currently inlined further below
}
####################
#
# Preliminary tests
@@ -250,6 +261,31 @@ if [[ ! -d $RESIFDD_DATADIR ]]; then
exit 1
fi
# Test for starting date. This is a mandatory parameter now.
if [[ -n ${RESIFDD_SINCE} ]]; then
echo "Dumping every change since ${RESIFDD_SINCE}"
# Try to parse the date into a normalized form
from_date=$(date -d "${RESIFDD_SINCE}" +%Y-%m-%d)
if [[ $? -ne 0 ]]; then
echo "Date format not recognized. Please specify a date like RESIFDD_SINCE=2022-06-01"
exit 1
fi
else
echo "No date provided to start from. Please specify a date like RESIFDD_SINCE=2022-06-01"
exit 1
fi
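# Any string understood by GNU date -d is accepted here, e.g.
# RESIFDD_SINCE=2022-06-01 or RESIFDD_SINCE="last month".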
# Test for database access in order to check the changes
psql -qtA -c "SELECT * from rall limit 1;" > /dev/null
if [[ $? -ne 0 ]]; then
echo "Unable to connect to database. Please specify all necessary environment variables for connection to inventory database:"
echo " - PGHOST=resif-pgprod.u-ga.fr"
echo " - PGPORT=5432"
echo " - PGDATABASE=resifInv-Prod"
echo " - PGUSER=resifinv_ro"
echo " - PGPASSWORD=XXXXXXXX"
echo "Make sure connection to database works without interaction and access is granted to table rall."
exit 1
fi
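# For a fully non-interactive run, export the connection settings beforehand,
# e.g. (illustrative values, placeholder password):
#   export PGHOST=resif-pgprod.u-ga.fr PGPORT=5432 PGDATABASE=resifInv-Prod
#   export PGUSER=resifinv_ro PGPASSWORD=XXXXXXXX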
####################
#
@@ -360,6 +396,15 @@ else
ils -l $KEY
fi
####################
# List all directories that have new data
#
# Select every row of rall whose created_at falls between the RESIFDD_SINCE parameter and the beginning of the current month
#
####################
target_directories=$(psql -qtAc "select distinct case when n.endtime='infinity' then r.year||'/'||n.network||'/'||r.station else r.year||'/'||n.network||n.start_year||'/'||r.station end from rall as r LEFT JOIN networks as n on r.network_id=n.network_id where r.created_at between '$from_date' and '$(date +%Y-%m-01)';")
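# Each line is a YEAR/NETWORK/STATION path relative to the snapshot root, the
# layout pack_and_send expects; temporary networks have their start year
# appended to the network code. Illustrative output:
#   2022/FR/CIEL
#   2022/Z32015/STA01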
####################
#
# Start dumping validated data
@@ -373,9 +418,8 @@ if [[ ! -d $SNAPSHOT_DIR ]]; then
exit 1
fi
parallel_jobs=3
echo "Starting dump of validated data with $parallel_jobs jobs"
find $SNAPSHOT_DIR -maxdepth 3 -mindepth 3 -type d | sort | parallel --jobs $parallel_jobs --max-args 1 pack_and_send {} {%}
echo "Starting dump of validated data with 4 jobs"
echo "$target_directories" | awk '$0="$SNAPSHOT_DIR/"$0' | parallel --jobs 4 --max-args 1 pack_and_send {} {%}
echo "Dump of validated data done"
####################
@@ -390,6 +434,6 @@ if [[ ! -d $SNAPSHOT_DIR ]]; then
exit 1
fi
echo "Starting dump of cold data with $parallel_jobs jobs"
find $SNAPSHOT_DIR -maxdepth 3 -mindepth 3 -type d | sort | parallel --jobs $parallel_jobs --max-args 1 pack_and_send {} {%}
echo "Starting dump of cold data with 4 jobs"
echo "$target_directories" | awk '$0="$SNAPSHOT_DIR/"$0' | parallel --jobs 4 --max-args 1 pack_and_send {} {%}
echo "Dump of cold validated data done"