| #!/usr/bin/env bash |
| # Copyright (c) 2012, Code Aurora Forum. All rights reserved. |
| # |
| # Redistribution and use in source and binary forms, with or without |
| # modification, are permitted provided that the following conditions are |
| # met: |
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of Code Aurora Forum, Inc. nor the names of its
#       contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
| # |
| # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
| # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
| # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
| # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
| # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
| # OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
| # IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
# usage [error_message]
#
# Print the help text on stdout, then (when an error message is given)
# "ERROR <message>" through info, and terminate with exit status 128.
#
# The option names shown here must match the spellings accepted by the
# option parser: --norefs and --noloose.
usage() { # error_message

    cat <<EOF
usage: $(basename "$0") [-unvt] [--norefs] [--noloose] [-r|--ratio number]
[git gc option...] git.repo

-u|-h usage/help
-v verbose
-n dry-run don't actually repack anything
-t touch treat repo as if it had been touched
--norefs avoid extra ref packing timestamp checking
--noloose do not run just because there are loose object dirs
(repacking may still run if they are referenced)
-r ratio <number> packfile ratio to aim for (default 10)

git gc option will be passed as args to git gc

git.repo to run gc against

Garbage collect using a pseudo logarithmic packfile maintenance
approach. This approach attempts to minimize packfile churn
by keeping several generations of varying sized packfiles around
and only consolidating packfiles (or loose objects) which are
either new packfiles, or packfiles close to the same size as
another packfile.

An estimate is used to predict when rollups (one consolidation
would cause another consolidation) would occur so that this
rollup can be done all at once via a single repack. This reduces
both the runtime and the pack file churn in rollup cases.

Approach: plan each consolidation by creating a table like this:

Id Keep Size Sha1(or consolidation list) Actions(repack down up note)
1 - 11356 9052edfb7392646cd4e5f362b953675985f01f96 y - - New
2 - 429088 010904d5c11cd26a79fda91b01ab454d1001b402 y - - New
c1 - 440444 [1,2] - - -

Id: numbers preceded by a c are estimated "c pack" files
Keep: - none, k private keep, o our keep
Size: in disk blocks (default du output)
Sha1: of packfile, or consolidation list of packfile ids
Actions
repack: - n no, y yes
down: - noop, ^ consolidate with a file above
up: - noop, v consolidate with a file below
note: Human description of script decisions:
New (file is a new packfile)
Consolidate with:<list of packfile ids>
(too far from:<list of packfile ids>)

On the first pass, always consolidate any new packfiles along
with loose objects and along with any packfiles which are within
the ratio size of their predecessors (note, the list is ordered
by increasing size). After each consolidation, insert a fake
consolidation, or "c pack", to naively represent the size and
ordered positioning of the anticipated new consolidated pack.
Every time a new pack is planned, rescan the list in case the
new "c pack" would cause more consolidation...

Once the packfiles which need consolidation are determined, the
packfiles which will not be consolidated are marked with a .keep
file, and those which will be consolidated will have their .keep
removed if they have one. Thus, the packfiles with a .keep will
not get repacked.

Packfile consolidation is determined by the --ratio parameter
(default is 10). This ratio is somewhat of a tradeoff. The
smaller the number, the more packfiles will be kept on average;
this increases disk utilization somewhat. However, a larger
ratio causes greater churn and may increase disk utilization due
to deleted packfiles not being reclaimed since they may still be
kept open by long running applications such as Gerrit. Sane
ratio values are probably between 2 and 10. Since most
consolidations actually end up smaller than the estimated
consolidated packfile size (due to compression), the true ratio
achieved will likely be 1 to 2 greater than the target ratio.
The smaller the target ratio, the greater this discrepancy.

Finally, attempt to skip garbage collection entirely on untouched
repos. In order to determine if a repo has been touched, use the
timestamp on the script's keep files, if any relevant file/dir
is newer than a keep marker file, assume that the repo has been
touched and gc needs to run. Also assume gc needs to run whenever
there are loose object dirs since they may contain untouched
unreferenced loose objects which need to be pruned (once they
expire).

In order to allow the keep files to be an effective timestamp
marker to detect relevant changes in a repo since the last run,
all relevant files and directories which may be modified during a
gc run (even during a noop gc run), must have their timestamps
reset to the same time as the keep files or gc will always run
even on untouched repos. The relevant files/dirs are all those
files and directories which garbage collection, object packing,
ref packing and pruning might change during noop actions.
EOF

    if [ -n "$1" ] ; then
        info "ERROR $1"
    fi

    exit 128
}
| |
# info message
# Write the message to stderr.
info() {
    echo "$1" 1>&2
}

# debug message
# Forward the message to info only when the verbose switch (SW_V) is set;
# otherwise print nothing and return non-zero.
debug() {
    [ -z "$SW_V" ] && return 1
    info "$1"
}
| |
# array_copy array_src array_dst
#
# Make the array named array_dst an exact copy of the elements of the
# array named array_src. Both arguments are variable NAMES, not values.
#
# The destination is replaced wholesale, so elements it held beyond the
# length of the source do not survive, and a sparse source is copied
# element by element rather than by assuming indexes 0..n-1.
#
# The names are expanded through eval, so they must be trusted identifiers.
# Locals are prefixed to avoid shadowing the caller's array names.
array_copy() { #v2 # array_src array_dst
    local _ac_src=$1 _ac_dst=$2
    # Expands to:  dst=("${src[@]}")
    eval "$_ac_dst=(\"\${${_ac_src}[@]}\")"
}
| |
# array_equals array_name [vals...]
#
# Compare the array whose NAME is given against the remaining arguments.
# Returns 0 when the lengths and every element match in order, 1 when the
# element counts differ, 2 on the first differing value.
array_equals() { #v2 # array_name [vals...]
    local name=$1 snapshot=() count item
    shift

    # Work on a private copy taken through array_copy.
    array_copy "$name" snapshot
    count=${#snapshot[@]}

    if [ "$count" -ne "$#" ] ; then
        return 1
    fi

    for item in "${snapshot[@]}" ; do
        if [ "$item" != "$1" ] ; then
            return 2
        fi
        shift
    done
    return 0
}
| |
# packs_sizes git.repo > "size pack"...
#
# Print one "size path" line (du -s output) per packfile in the repo,
# ordered by increasing size.
#
# $SHA1 is deliberately unquoted: it is a glob pattern. When no packfile
# matches, the pattern stays literal and du complains, so du's stderr is
# discarded and the result is simply empty.
packs_sizes() { # git.repo > "size pack"...
    du -s "$1"/objects/pack/pack-$SHA1.pack 2> /dev/null | sort -n
}
| |
# is_ourkeep keep
# True when the given keep file contains the $KEEP marker text.
# A missing or unreadable file is silently treated as "not ours".
is_ourkeep() { # keep
    grep "$KEEP" "$1" > /dev/null 2>&1
}
# has_ourkeep pack
# True when the keep file belonging to the given packfile carries our marker.
has_ourkeep() { # pack
    local keepfile
    keepfile=$(keep_for "$1")
    is_ourkeep "$keepfile"
}
# has_keep pack
# True when a keep file (ours or private) exists for the given packfile.
has_keep() { # pack
    local keepfile
    keepfile=$(keep_for "$1")
    test -f "$keepfile"
}
# is_repo git.repo
# True when the directory has both an objects dir and a refs/heads dir.
is_repo() { # git.repo
    [ -d "$1/objects" ] || return 1
    [ -d "$1/refs/heads" ]
}
| |
| keep() { # pack # returns true if we added our keep |
| keep=$(keep_for "$1") |
| [ -f "$keep" ] && return 1 |
| echo "$KEEP" > "$keep" |
| return 0 |
| } |
| |
# keep_for packfile > keepfile
#
# Print the keep file name for a packfile by swapping a trailing ".pack"
# for ".keep". Fails (status 1, no output) when the resulting name does
# not contain ".keep" at all.
keep_for() { # packfile > keepfile
    local path=$1
    case "$path" in
        *.pack) path="${path%.pack}.keep" ;;
    esac
    case "$path" in
        *.keep*) echo "$path" ;;
        *) return 1 ;;
    esac
}
| |
# idx_for packfile > idxfile
#
# Print the index file name for a packfile by swapping a trailing ".pack"
# for ".idx". Fails (status 1, no output) when the resulting name does
# not contain ".idx" at all.
idx_for() { # packfile > idxfile
    local path=$1
    case "$path" in
        *.pack) path="${path%.pack}.idx" ;;
    esac
    case "$path" in
        *.idx*) echo "$path" ;;
        *) return 1 ;;
    esac
}
| |
# sha_for pack_or_keep_file > sha
#
# Reduce ".../pack-<sha>.<ext>" to just <sha>: any leading directories,
# the "pack-" prefix and everything from the first dot after the sha are
# dropped. Input that does not look like a pack name is printed unchanged.
sha_for() {
    sed -e 's|\(.*/\)*pack-\([^.]*\)\..*$|\2|' <<< "$1"
}
| |
# private_keeps git.repo -> sets pkeeps
#
# Fill the pkeeps array (caller's or global scope) with the paths of all
# keep files in the repo's pack directory that do NOT carry our marker,
# i.e. keep files owned by someone else.
#
# $SHA1 is deliberately unquoted: it is a glob pattern. When nothing
# matches, bash leaves the pattern in place as a literal word; that
# non-existent "file" is skipped instead of being reported as a keep.
private_keeps() { # git.repo -> sets pkeeps
    local repo=$1
    local keep
    pkeeps=()
    for keep in "$repo"/objects/pack/pack-$SHA1.keep ; do
        [ -f "$keep" ] || continue
        is_ourkeep "$keep" || pkeeps=("${pkeeps[@]}" "$keep")
    done
    return 0
}
| |
# is_tooclose smaller larger
# True when smaller * $RATIO is strictly greater than larger, i.e. the
# two sizes are within the configured ratio of each other.
is_tooclose() { # smaller larger
    local scaled=$(($1 * $RATIO))
    test "$scaled" -gt "$2"
}
| |
# unique [args...] > unique_words
#
# Print the arguments on a single line, sorted (sort -u order) and with
# duplicates removed, separated by single spaces.
unique() { # [args...] > unique_words
    local word sorted
    sorted=$(for word in "$@" ; do echo "$word" ; done | sort -u)
    # Unquoted on purpose: collapse the sorted lines into words.
    echo $sorted # as words
}
| |
# outfs fs [args...] > argfs...
#
# Print the arguments joined by the field separator fs, with no trailing
# newline. Prints nothing when there are no arguments after fs.
outfs() { # fs [args...] > argfs...
    local sep=$1 lead='' arg
    shift
    for arg in "$@" ; do
        echo -n "$lead$arg"
        lead=$sep
    done
}
| |
# sort_list < list > formatted_list
#
# Read list rows (fields: n has_keep size sha repack down up note...),
# print them in fixed-width columns with the note (field 8 onwards)
# rejoined by single spaces, ordered by size and then by id, both numeric.
sort_list() { # < list > formatted_list
    # n has_keep size sha repack down up note
    awk '{
        note = $8
        for (i = 9; i <= NF; i++) {
            note = note " " $i
        }
        printf("%-5s %s %-14s %-40s %s %s %s %s\n", $1, $2, $3, $4, $5, $6, $7, note)
    }' \
        | sort -k 3,3n -k 1,1n
}
| |
# is_touched git.repo
#
# Decide whether gc needs to run. Returns 0 (touched) when:
#   - the -t switch (SW_T) is set, or
#   - --noloose (SW_LOOSE) is not set and a loose object directory exists, or
#   - none of the keep files in the pack dir is ours, or
#   - something under objects, refs or packed-refs (objects/info excluded)
#     is newer than our keep file.
# Returns 1 when nothing newer than our keep file was found.
is_touched() { # git.repo
    local repo=$1
    local loose keep ours newer
    if [ -n "$SW_T" ] ; then
        debug "$SW_T -> treat as touched"
        return 0
    fi

    if [ -z "$SW_LOOSE" ] ; then
        # If there are loose objects, they may need to be pruned,
        # run even if nothing has really been touched.
        # Loose object directories are named with two hex digits, so the
        # pattern has to accept a-f as well as 0-9. Matching on the base
        # name keeps glob characters in the repo path out of the pattern.
        loose=$(find "$repo/objects" -mindepth 1 -maxdepth 1 -type d \
                     -name '[0-9a-f][0-9a-f]' \
                     -print -quit 2>/dev/null)
        if [ -n "$loose" ] ; then
            info "There are loose object directories"
            return 0
        fi
    fi

    # If we don't have a keep, the current packfiles may not have been
    # compressed with the current gc policy (gc may never have been run),
    # so run at least once to repack everything.  Also, we need a marker
    # file for timestamp tracking (a dir needs to detect changes within
    # it, so it cannot be a marker) and our keeps are something we control,
    # use them.
    for keep in "$repo"/objects/pack/pack-$SHA1.keep ; do
        is_ourkeep "$keep" && { ours=$keep ; break ; }
    done
    if [ -z "$ours" ] ; then
        info 'We have no keep (we have never run?): run'
        return 0
    fi

    debug "Our timestamp keep: $ours"
    # The wholename stuff seems to get touched by a noop git gc
    newer=$(find "$repo/objects" "$repo/refs" "$repo/packed-refs" \
                 '!' -wholename "$repo/objects/info" \
                 '!' -wholename "$repo/objects/info/*" \
                 -newer "$ours" \
                 -print -quit 2>/dev/null)
    [ -z "$newer" ] && return 1

    info "Touched since last run: $newer"
    return 0
}
| |
# touch_refs git.repo start_date refs
#
# Set the modification time of every path listed in refs (one per line,
# relative to <git.repo>/refs) to start_date. Paths that no longer exist
# are not created (touch -c). Runs in a subshell so the caller's working
# directory is left alone.
touch_refs() { # git.repo start_date refs
    local gitdir=$1 when=$2 names=$3
    (
        debug "Setting start date($when) on unpacked refs:"
        debug "$names"
        cd "$gitdir/refs" || return
        # safe to assume no newlines in a ref name
        xargs -d '\n' -n 1 touch -c -d "$when" <<< "$names"
    )
}
| |
# set_start_date git.repo start_date refs refdirs packedrefs [packs]
#
# Reset timestamps to start_date on everything a gc run may have touched,
# so those files do not look newer than our keep files on the next run:
#   - each given packfile together with its keep and idx files,
#   - the pack directory itself,
#   - unless --norefs (SW_REFS) is set and only if packed-refs is newer than
#     the pack directory: packed-refs and the ref directories in refdirs.
# refs and packedrefs are snapshots taken before gc; if either differs from
# the current state, packed-refs is touched again with the current time.
set_start_date() { # git.repo start_date refs refdirs packedrefs [packs]
    local repo=$1 start_date=$2 refs=$3 refdirs=$4 packedrefs=$5 ; shift 5
    local pack keep idx repacked

    # This stuff is touched during object packs
    while [ $# -gt 0 ] ; do
        pack=$1 ; shift
        keep="$(keep_for "$pack")"
        idx="$(idx_for "$pack")"
        touch -c -d "$start_date" "$pack" "$keep" "$idx"
        debug "Setting start date on: $pack $keep $idx"
    done
    # This will prevent us from detecting any deletes in the pack dir
    # since gc ran, except for private keeps which we are checking
    # manually.  But there really shouldn't be any other relevant deletes
    # in this dir which should cause us to rerun next time, deleting a
    # pack or index file by anything but gc would be bad!
    debug "Setting start date on pack dir: $start_date"
    touch -c -d "$start_date" "$repo/objects/pack"


    if [ -z "$SW_REFS" ] ; then
        # One single find command: the line continuation keeps -print -quit
        # and the stderr redirection attached to it, so a missing
        # packed-refs file stays silent.
        repacked=$(find "$repo/packed-refs" -newer "$repo/objects/pack" \
                        -print -quit 2>/dev/null)
        if [ -n "$repacked" ] ; then
            # The ref dirs and packed-ref files seem to get touched even on
            # a noop refpacking
            debug "Setting start date on packed-refs"
            touch -c -d "$start_date" "$repo/packed-refs"
            touch_refs "$repo" "$start_date" "$refdirs"

            # A ref repack does not imply a ref change, but since it is
            # hard to tell, simply assume so
            if [ "$refs" != "$(cd "$repo/refs" && find . -depth)" ] || \
               [ "$packedrefs" != "$(<"$repo/packed-refs")" ] ; then
                # We retouch if needed (instead of simply checking then
                # touching) to avoid a race between the check and the set.
                debug "  but refs actually got packed, so retouch packed-refs"
                touch -c "$repo/packed-refs"
            fi
        fi
    fi
}
| |
# note_consolidate note entry > note
#
# Add entry to the "Consolidate with:<ids>" list carried in a note and
# print the updated note. An existing list is pulled out of the note,
# merged with entry through unique (sorted, no duplicates) and re-appended
# at the end; all other note text is kept as is.
#
# The remaining note text is handled as a plain string, never word-split,
# so commas elsewhere in it (e.g. in a "too far from" list) are preserved.
note_consolidate() { # note entry > note (no duplicated consolidated entries)
    local note=$1 entry=$2
    local entries=() list
    if echo "$note" | grep -q 'Consolidate with:[0-9,c]' ; then
        list=$(echo "$note" | sed -e 's/^.*Consolidate with:\([0-9,c]*\).*$/\1/')
        # list only holds digits, "c" and commas: safe to split unquoted
        entries=( ${list//,/ } )
        note=$(echo "$note" | sed -e 's/Consolidate with:[0-9,c]*//')
    fi
    entries=( $(unique "${entries[@]}" "$entry") )
    echo "$note Consolidate with:$(outfs , "${entries[@]}")"
}
| |
# note_toofar note entry > note
#
# Add entry to the "(too far from:<ids>)" list carried in a note and print
# the updated note. An existing list is pulled out of the note, merged
# with entry through unique (sorted, no duplicates) and re-appended at the
# end; all other note text is kept as is.
#
# The remaining note text is handled as a plain string, never word-split,
# so commas elsewhere in it (e.g. in a "Consolidate with" list) are
# preserved.
note_toofar() { # note entry > note (no duplicated "too far" entries)
    local note=$1 entry=$2
    local entries=() list
    if echo "$note" | grep -q '(too far from:[0-9,c]*)' ; then
        list=$(echo "$note" | sed -e 's/^.*(too far from:\([0-9,c]*\)).*$/\1/')
        # list only holds digits, "c" and commas: safe to split unquoted
        entries=( ${list//,/ } )
        note=$(echo "$note" | sed -e 's/(too far from:[0-9,c]*)//')
    fi
    entries=( $(unique "${entries[@]}" "$entry") )
    echo "$note (too far from:$(outfs , "${entries[@]}"))"
}
| |
# last_entry size_hit pline repackline > last_rows_entry
#
# Print the row that was held back on the previous iteration:
#   - nothing when there is no pending row (pline is empty),
#   - repackline when size_hit is set (the row is consolidated upwards),
#   - pline otherwise.
# The output depends only on the three arguments, not on any variable of
# the caller.
last_entry() { # isRepack pline repackline > last_rows_entry
    local size_hit=$1 pline=$2 repackline=$3
    if [ -n "$pline" ] ; then
        if [ -n "$size_hit" ] ; then
            echo "$repackline"
        else
            echo "$pline"
        fi
    fi
}
| |
# init_list git.repo > shortlist
#
# Build the initial planning table: one row per packfile reported by
# packs_sizes, numbered in that order, as
#     id keep size sha n
# where keep is "-" (no keep file), "k" (keep file that is not ours) or
# "o" (our keep file), and the repack column starts out as "n".
# The rows are passed through sort_list.
init_list() { # git.repo > shortlist
    local repo=$1
    local path blocks count keepflag id

    packs_sizes "$1" | while read blocks path ; do
        count=$((count+1))
        keepflag=-
        if has_keep "$path" ; then
            if has_ourkeep "$path" ; then
                keepflag=o
            else
                keepflag=k
            fi
        fi
        id=$(sha_for "$path")
        echo "$count $keepflag $blocks $id n"
    done | sort_list
}
| |
# consolidate_list run < list > list
#
# Run one planning pass over the table read from stdin (rows ordered by
# increasing size) and print the updated table through sort_list.
#
# For every row that is not yet marked for repack:
#   - rows with a private keep ("k") are printed as "Private" and skipped,
#   - a row too close in size to the previous row (is_tooclose) is marked
#     for repack together with that previous row,
#   - a row without any keep file is new and is marked for repack,
#   - otherwise the row is noted as too far from the previous one.
# When anything was marked, a synthetic "c<run>" row carrying the summed
# size and the list of consolidated ids is appended.
#
# Each row is printed one iteration late, because only the following row
# tells whether it gets consolidated upwards.
consolidate_list() { # run < list > list
    local run=$1
    local sum=0 psize=0 sum_size=0 size_hit pn clist pline repack_line repack2
    local n has_keep size sha repack down up note

    {
        while read n has_keep size sha repack down up note; do
            [ -z "$up" ] && up='-'
            [ -z "$down" ] && down="-"

            if [ "$has_keep" = "k" ] ; then
                echo "$n $has_keep $size $sha $repack - - Private"
                continue
            fi

            if [ "$repack" = "n" ] ; then
                if is_tooclose $psize $size ; then
                    size_hit=y
                    repack=y
                    sum=$(($sum + $sum_size + $size))
                    sum_size=0 # Prevents double summing this entry
                    # $pn is unquoted on purpose: it is empty on the first row
                    clist=($(unique "${clist[@]}" $pn $n))
                    down="^"
                    [ "$has_keep" = "-" ] && note="$note New +"
                    note=$(note_consolidate "$note" "$pn")
                elif [ "$has_keep" = "-" ] ; then
                    repack=y
                    sum=$(($sum + $size))
                    sum_size=0 # Prevents double summing this entry
                    clist=($(unique "${clist[@]}" $n))
                    note="$note New"
                elif [ $psize -ne 0 ] ; then
                    sum_size=$size
                    down="!"
                    note=$(note_toofar "$note" "$pn")
                else
                    sum_size=$size
                fi
            else
                sum_size=$size
            fi

            # By preventing "c files" (consolidated) from being marked
            # "repack" they won't get keeps
            repack2=y
            [ "${n/c}" != "$n" ] && { repack=- ; repack2=- ; }

            last_entry "$size_hit" "$pline" "$repack_line"
            # Delay the printout until we know whether we are
            # being consolidated with the entry following us
            # (we won't know until the next iteration).
            # size_hit is used to determine which of the lines
            # below will actually get printed above on the next
            # iteration.
            pline="$n $has_keep $size $sha $repack $down $up $note"
            repack_line="$n $has_keep $size $sha $repack2 $down v $note"

            pn=$n ; psize=$size # previous entry data
            size_hit='' # will not be consolidated up

        done
        # Flush the row still held back after the last iteration
        last_entry "$size_hit" "$pline" "$repack_line"

        [ $sum -gt 0 ] && echo "c$run - $sum [$(outfs , "${clist[@]}")] - - -"

    } | sort_list
}
| |
# process_list git.repo > list
#
# Plan the consolidation: start from init_list and feed the table through
# consolidate_list, with an increasing run number, until a pass no longer
# changes it. Every changed intermediate table is reported via debug; the
# final table is printed on stdout.
process_list() { # git.repo > list
    local current previous pass=0
    local rule="------------------------------------------------------------------------------------"
    current=$(init_list "$1")

    while : ; do
        previous=$current
        pass=$((pass +1))
        current=$(echo "$previous" | consolidate_list "$pass")
        # Fixed point reached: nothing left to consolidate
        [ "$previous" = "$current" ] && break
        debug "$rule"
        debug "$HEADER"
        debug "$current"
    done
    debug "$rule"
    echo "$current"
}
| |
# repack_list git.repo < list
#
# Execute the plan read from stdin:
#   1. snapshot refs, ref dirs and packed-refs (unless --norefs / SW_REFS),
#   2. take a stable snapshot of the private keep files and record the
#      start date,
#   3. remove the keep file of every pack the plan marks for repack ("y"),
#   4. run "git gc" with GC_OPTS inside the repo,
#   5. give every existing pack that has no keep file our keep file,
#   6. reset timestamps via set_start_date when at most one new pack
#      appeared or the private keep snapshot differs.
# Returns the exit status of git gc.
repack_list() { # git.repo < list
    local repo=$1
    local start_date newpacks=0 pkeeps keeps=1 refs refdirs rtn
    local n has_keep size sha repack down up note keep pack packs
    local packedrefs=''

    # A repo without a packed-refs file is fine: keep the snapshot empty
    # instead of letting the shell print a redirection error.
    [ -r "$repo/packed-refs" ] && packedrefs=$(<"$repo/packed-refs")

    # so they don't appear touched after a noop refpacking
    if [ -z "$SW_REFS" ] ; then
        refs=$(cd "$repo/refs" && find . -depth)
        # -depth is a global option and goes before the -type test
        refdirs=$(cd "$repo/refs" && find . -depth -type d)
        debug "Before refs:"
        debug "$refs"
    fi

    # Find a private keep snapshot which has not changed from
    # before our start_date so private keep deletions during gc
    # can be detected
    # (keeps starts as the dummy value 1 so the first comparison fails
    # and the loop body runs at least once)
    while ! array_equals pkeeps "${keeps[@]}" ; do
       debug "Getting a private keep snapshot"
       private_keeps "$repo"
       keeps=("${pkeeps[@]}")
       debug "before keeps: ${keeps[*]}"
       start_date=$(date)
       private_keeps "$repo"
       debug "after keeps: ${pkeeps[*]}"
    done

    while read n has_keep size sha repack down up note; do
        if [ "$repack" = "y" ] ; then
            keep="$repo/objects/pack/pack-$sha.keep"
            info "Repacking $repo/objects/pack/pack-$sha.pack"
            [ -f "$keep" ] && rm -f "$keep"
        fi
    done

    ( cd "$repo" && git gc "${GC_OPTS[@]}" ) ; rtn=$?

    # Mark any files without a .keep with our .keep
    packs=("$repo"/objects/pack/pack-$SHA1.pack)
    for pack in "${packs[@]}" ; do
        # An unmatched glob stays literal: never create a keep for it
        [ -f "$pack" ] || continue
        if keep "$pack" ; then
            info "New pack: $pack"
            newpacks=$((newpacks+1))
        fi
    done

    # Record start_time.  If there is more than 1 new packfile, we
    # don't want to risk touching it with an older date since that
    # would prevent consolidation on the next run.  If the private
    # keeps have changed, then we should run next time no matter what.
    if [ $newpacks -le 1 ] || ! array_equals pkeeps "${keeps[@]}" ; then
        set_start_date "$repo" "$start_date" "$refs" "$refdirs" "$packedrefs" "${packs[@]}"
    fi

    return $rtn # we really only care about the gc error code
}
| |
# git_gc git.repo
#
# Plan the consolidation with process_list and hand the plan to
# repack_list. When not verbose (SW_V empty), a summary line, the table
# header and the final plan are written to stderr first.
# Returns the status of repack_list.
git_gc() { # git.repo
    local list
    list=$(process_list "$1")
    if [ -z "$SW_V" ] ; then
        # [*] joins all options into the one message argument; [@] would
        # split them into extra arguments that info never prints.
        info "Running $PROG on $1.  git gc options: ${GC_OPTS[*]}"
        echo "$HEADER" >&2
        echo "$list" >&2
    fi
    echo "$list" | repack_list "$1"
}
| |
| |
# Script-wide constants.
#   PROG    name this script was invoked as
#   HEADER  column titles printed above the planning table
#   KEEP    marker text written into the keep files this script creates
#   HEX     glob for one hex digit; HEX10 ten of them; SHA1 forty of them
PROG="$(basename "$0")"
HEADER="Id Keep Size Sha1(or consolidation list) Actions(repack down up note)"
KEEP="git-exproll"
HEX='[0-9a-f]'
HEX10="${HEX}${HEX}${HEX}${HEX}${HEX}${HEX}${HEX}${HEX}${HEX}${HEX}"
SHA1="${HEX10}${HEX10}${HEX10}${HEX10}"
| |
# Command line parsing.
#
# Switch variables are empty when a switch is absent and hold the switch
# text when it is present. Any other argument except the last one is
# collected into GC_OPTS for git gc; the last argument is left in $1 as
# the repository path.
RATIO=10
SW_N='' ; SW_V='' ; SW_T='' ; SW_REFS='' ; SW_LOOSE='' ; GC_OPTS=()
while [ $# -gt 0 ] ; do
    case "$1" in
        -u|-h)  usage ;;
        -n)  SW_N="$1" ;;
        -v)  SW_V="$1" ;;

        -t)  SW_T="$1" ;;
        # --noref is accepted as an alternative spelling of --norefs
        --noref|--norefs)  SW_REFS="$1" ;;
        --noloose)  SW_LOOSE="$1" ;;

        -r|--ratio)
            # The ratio is used in shell arithmetic later on, so insist
            # on a plain non-negative integer here.
            [ $# -ge 2 ] || usage "$1 requires a number"
            shift
            case "$1" in
                ''|*[!0-9]*) usage "ratio is not a number: $1" ;;
            esac
            RATIO=$((10#$1)) # force base 10, a leading 0 is not octal
            ;;

        *)  [ $# -le 1 ] && break
            GC_OPTS=( "${GC_OPTS[@]}" "$1" )
            ;;
    esac
    shift
done
| |
| |
# The remaining argument names the repository: either a git directory
# itself or a work tree that contains a .git directory. Anything else
# ends the script through usage.
REPO=$1
is_repo "$REPO" || {
    REPO="$REPO/.git"
    if ! is_repo "$REPO" ; then
        usage "($1) is not likely a git repo"
    fi
}
| |
| |
# Dispatch: with -n (dry run) only analyse and print the plan on stderr;
# otherwise run the gc, unless the repo is untouched since the last run.
if [ -n "$SW_N" ] ; then
    is_touched "$REPO" || info "Repo untouched since last run, analyze anyway."
    process_list "$REPO" >&2
else
    if ! is_touched "$REPO" ; then
        info "Repo untouched since last run"
        exit
    fi
    git_gc "$REPO"
fi