#!/bin/sh
#/ Usage: ghe-backup-repositories-rsync
#/ Take an online, incremental snapshot of all Git repository data.
#/
#/ Note: This command typically isn't called directly. It's invoked by
#/ ghe-backup when the rsync strategy is used.
set -e

# This command is designed to allow for transferring active Git repository data
# from a GitHub instance to a backup site in a way that ensures data is
# captured in a consistent state even when being written to.
#
# - All Git GC operations are disabled on the GitHub instance for the duration of
#   the backup. This removes the possibly of objects or packs being removed
#   while the backup is in progress.
#
# - In progress Git GC operations are given a cooldown window to complete. The
#   script will sleep for up to 60 seconds waiting for GC operations to finish.
#
# - Git repository data is transferred in a specific order: auxiliary files,
#   packed refs, loose refs, reflogs, and finally objects and pack files in that
#   order. This ensures that all referenced objects are captured.
#
# - Git GC operations are re-enabled on the GitHub instance.
#
# The script uses multiple runs of rsync to transfer repository files. Each run
# includes a list of filter rules that ensure only specific types of files are
# transferred.
#
# See the "FILTER RULES" and "INCLUDE/EXCLUDE PATTERN RULES" sections of the
# rsync(1) manual for more information:
#      <http://rsync.samba.org/ftp/rsync/rsync.html>

# Bring in the backup configuration
cd $(dirname "$0")/../..
. share/github-backup-utils/ghe-backup-config

# Set up remote host and root backup snapshot directory based on config
host="$GHE_HOSTNAME"
backup_dir="$GHE_SNAPSHOT_DIR/repositories"

# Location of last good backup for rsync --link-dest
backup_current="$GHE_DATA_DIR/current/repositories"

# Verify rsync is available.
if ! rsync --version 1>/dev/null 2>&1; then
    echo "Error: rsync not found." 1>&2
    exit 1
fi

# Perform a host-check and establish GHE_REMOTE_XXX variables.
ghe_remote_version_required "$host"

# Remote sync-in-progress file location. When this file exists, Git GC
# operations are disabled on the GitHub instance.
sync_in_progress_file="$GHE_REMOTE_DATA_USER_DIR/repositories/.sync_in_progress"

# Make sure root backup dir exists if this is the first run
mkdir -p "$backup_dir"

# Removes the remote sync-in-progress file on exit, re-enabling GC operations
# on the remote instance.
cleanup() {
  ghe-ssh "$host" -- "sudo rm -f '$sync_in_progress_file'"
}
trap 'cleanup' EXIT
trap 'exit $?' INT # ^C always terminate

# Touch the sync-in-progress file, disabling GC operations, and wait for all
# active GC processes to finish on the remote side.
echo "
    set -e
    sudo -u git touch '$sync_in_progress_file'

    sanity=0
    while [ \$sanity -lt $GHE_GIT_COOLDOWN_PERIOD ]; do
        # note: the bracket synta[x] below is to prevent matches against the
        # grep process itself.
        if ps axo args | grep -E -e '^git( -.*)? nw-repac[k]( |$)' -e '^git( -.*)? g[c]( |$)' >/dev/null; then
            sleep 1
            sanity=\$(( sanity + 1 ))
        else
            exit 0
        fi
    done
    exit 7
" | ghe-ssh "$host" -- /bin/sh || {
    res=$?
    if [ $res = 7 ]; then
        echo "Error: Git GC processes remain after $GHE_GIT_COOLDOWN_PERIOD seconds. Aborting..." 1>&2
    fi
    exit $res
}

# Transfer repository data from a GitHub instance to the current snapshot
# directory, using a previous snapshot to avoid transferring files that have
# already been transferred. A set of rsync filter rules are provided on stdin
# for each invocation.
rsync_repository_data () {
    ghe-rsync -av \
        -e "ghe-ssh -p $(ssh_port_part "$host")" \
        $link_dest "$@" \
        --rsync-path='sudo -u git rsync' \
        --include-from=- --exclude=\* \
        "$(ssh_host_part "$host"):$GHE_REMOTE_DATA_USER_DIR/repositories/" \
        "$backup_dir" 1>&3
}

# If we have a previous increment, avoid transferring existing files via rsync's
# --link-dest support. This also decreases physical space usage considerably.
if [ -d "$backup_current" ]; then
    link_dest="--link-dest=../../current/repositories"
fi

# Sync all auxiliary repository data. This includes files and directories like
# HEAD, audit_log, config, description, info/, etc. No refs or object data
# should be transferred here.
echo 1>&3
echo "* Transferring auxiliary files ..." 1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
- /*/*.git/objects
- /*/*.git/refs
- /*/*.git/packed-refs
- /*/*.git/logs
+ /*/*.git/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
- /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/refs
- /*/??/??/??/gist/*.git/packed-refs
- /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/**

+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
- /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/refs
- /*/nw/??/??/??/*/*.git/packed-refs
- /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/**
RULES

# Sync packed refs files. This is performed before sync'ing loose refs since
# loose refs trump packed-refs information.
echo 1>&3
echo "* Transferring packed-refs files ..." 1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/packed-refs

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/packed-refs

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/packed-refs
RULES

# Sync loose refs and reflogs. This must be performed before object data is
# transferred to ensure that all referenced objects are included.
echo 1>&3
echo "* Transferring refs and reflogs ..."  1>&3
rsync_repository_data -z <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/refs
+ /*/*.git/refs/**
+ /*/*.git/logs
+ /*/*.git/logs/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/refs
+ /*/??/??/??/gist/*.git/refs/**
+ /*/??/??/??/gist/*.git/logs
+ /*/??/??/??/gist/*.git/logs/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/refs
+ /*/nw/??/??/??/*/*.git/refs/**
+ /*/nw/??/??/??/*/*.git/logs
+ /*/nw/??/??/??/*/*.git/logs/**
RULES

# Sync git objects and pack files. Compression is disabled during this phase
# since these files are already well compressed.
echo 1>&3
echo "* Transferring objects and packs ..." 1>&3
rsync_repository_data -H <<RULES
- /__*__/
- /info/

+ /*/
+ /*/*.git
+ /*/*.git/objects
- /*/*.git/objects/**/tmp_*
+ /*/*.git/objects/**

+ /*/??/
+ /*/??/??/
+ /*/??/??/??/
+ /*/??/??/??/gist/
+ /*/??/??/??/gist/*.git
+ /*/??/??/??/gist/*.git/objects
- /*/??/??/??/gist/*.git/objects/**/tmp_*
+ /*/??/??/??/gist/*.git/objects/**

+ /*/nw/??/
+ /*/nw/??/??/
+ /*/nw/??/??/??/
+ /*/nw/??/??/??/*/
+ /*/nw/??/??/??/*/*.git
+ /*/nw/??/??/??/*/*.git/objects
- /*/nw/??/??/??/*/*.git/objects/**/tmp_*
+ /*/nw/??/??/??/*/*.git/objects/**
RULES

# Sync __special__ data directories, including the __alambic_assets__,
# __hookshot__, and __purgatory__ directories. The __nodeload_archives__,
# __gitmon__, and __render__ directories are excludes since they act only as
# caches.
#
# Under v2.x and greater, only the special __purgatory__ directory remains under
# /data/repositories. All other special user data directories have been moved under
# the /data/user directory.
echo 1>&3
echo "* Transferring special data directories ..." 1>&3
rsync_repository_data <<RULES
- /__nodeload_archives__/
- /__gitmon__/
- /__render__/
+ /__*__/
+ /__*__/**
+ /info/
- /info/lost+found/
+ /info/*
RULES
echo 1>&3
