#!/usr/bin/env bash
# Copyright (c) 2012, Code Aurora Forum. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#    # Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#    # Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#    # Neither the name of Code Aurora Forum, Inc. nor the names of its
#       contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
# ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
# Print the help text on stdout, then (when an error message is given)
# "ERROR <message>" on stderr via info, and terminate with status 128.
#
# The option names shown here must match the option parser at the bottom
# of the script (--norefs, --noloose, -r|--ratio).
# The here-doc uses <<- so the leading tabs of its body are stripped.
usage() { # error_message

    cat <<-EOF
	usage: $(basename "$0") [-unvt] [--norefs] [--noloose] [-r|--ratio number]
	       [git gc option...] git.repo

	-u|-h                usage/help
	-v                   verbose
	-n dry-run           don't actually repack anything
	-t touch             treat repo as if it had been touched
	--norefs             avoid extra ref packing timestamp checking
	--noloose            do not run just because there are loose object dirs
	                     (repacking may still run if they are referenced)
	-r|--ratio <number>  packfile ratio to aim for (default 10)

	git gc option        will be passed as args to git gc

	git.repo             to run gc against

	Garbage collect using a pseudo logarithmic packfile maintenance
	approach.  This approach attempts to minimize packfile churn
	by keeping several generations of varying sized packfiles around
	and only consolidating packfiles (or loose objects) which are
	either new packfiles, or packfiles close to the same size as
	another packfile.

	An estimate is used to predict when rollups (one consolidation
	would cause another consolidation) would occur so that this
	rollup can be done all at once via a single repack.  This reduces
	both the runtime and the pack file churn in rollup cases.

	Approach: plan each consolidation by creating a table like this:

	Id Keep Size    Sha1(or consolidation list)              Actions(repack down up note)
	1     - 11356   9052edfb7392646cd4e5f362b953675985f01f96 y - - New
	2     - 429088  010904d5c11cd26a79fda91b01ab454d1001b402 y - - New
	c1    - 440444  [1,2]                                    - - -

	Id:    numbers preceded by a c are estimated "c pack" files
	Keep:  - none, k private keep, o our keep
	Size:  in disk blocks (default du output)
	Sha1:  of packfile, or consolidation list of packfile ids
	Actions
	  repack: - n no, y yes
	  down:   - noop, ^ consolidate with a file above
	  up:     - noop, v consolidate with a file below
	  note:   Human description of script decisions:
	            New (file is a new packfile)
	            Consolidate with:<list of packfile ids>
	            (too far from:<list of packfile ids>)

	On the first pass, always consolidate any new packfiles along
	with loose objects and along with any packfiles which are within
	the ratio size of their predecessors (note, the list is ordered
	by increasing size).  After each consolidation, insert a fake
	consolidation, or "c pack", to naively represent the size and
	ordered positioning of the anticipated new consolidated pack.
	Every time a new pack is planned, rescan the list in case the
	new "c pack" would cause more consolidation...

	Once the packfiles which need consolidation are determined, the
	packfiles which will not be consolidated are marked with a .keep
	file, and those which will be consolidated will have their .keep
	removed if they have one.  Thus, the packfiles with a .keep will
	not get repacked.

	Packfile consolidation is determined by the --ratio parameter
	(default is 10).  This ratio is somewhat of a tradeoff.  The
	smaller the number, the more packfiles will be kept on average;
	this increases disk utilization somewhat.  However, a larger
	ratio causes greater churn and may increase disk utilization due
	to deleted packfiles not being reclaimed since they may still be
	kept open by long running applications such as Gerrit.  Sane
	ratio values are probably between 2 and 10.  Since most
	consolidations actually end up smaller than the estimated
	consolidated packfile size (due to compression), the true ratio
	achieved will likely be 1 to 2 greater than the target ratio.
	The smaller the target ratio, the greater this discrepancy.

	Finally, attempt to skip garbage collection entirely on untouched
	repos.  In order to determine if a repo has been touched, use the
	timestamp on the script's keep files, if any relevant file/dir
	is newer than a keep marker file, assume that the repo has been
	touched and gc needs to run.  Also assume gc needs to run whenever
	there are loose object dirs since they may contain untouched
	unreferenced loose objects which need to be pruned (once they
	expire).

	In order to allow the keep files to be an effective timestamp
	marker to detect relevant changes in a repo since the last run,
	all relevant files and directories which may be modified during a
	gc run (even during a noop gc run), must have their timestamps
	reset to the same time as the keep files or gc will always run
	even on untouched repos.  The relevant files/dirs are all those
	files and directories which garbage collection, object packing,
	ref packing and pruning might change during noop actions.
EOF

    [ -n "$1" ] && info "ERROR $1"

    exit 128
}
131
# Print a message on stderr (via info) only when verbose mode is on, i.e.
# when SW_V is non-empty.  Returns non-zero when verbose mode is off.
debug() { [ -n "$SW_V" ] && info "$1" ; }
# Print a single message on stderr.  printf is used instead of echo so
# that messages which look like echo options (-n, -e) are printed as is.
info() { printf '%s\n' "$1" >&2 ; }
134
# Copy the array named by $1 into the array named by $2.
#
# The destination is replaced as a whole, so elements it held before the
# call never survive.  The locals carry an unusual prefix so that they
# cannot shadow a caller array that happens to be called src, dst, s or i.
# eval is kept (rather than namerefs) to stay compatible with old bash.
array_copy() { #v2 # array_src array_dst
    local __ac_src=$1 __ac_dst=$2
    eval "$__ac_dst=(\"\${${__ac_src}[@]}\")"
}
144
# Compare the array named by $1 against the remaining arguments.
# Returns 0 when both hold the same values in the same order,
#         1 when the number of elements differs,
#         2 when an element differs.
array_equals() { #v2 # array_name [vals...]
    local a=$1 ; shift
    local t=() val
    array_copy "$a" t
    [ "${#t[@]}" -ne "$#" ] && return 1
    for val in "${t[@]}" ; do
        [ "$val" = "$1" ] || return 2
        shift
    done
    return 0
}
157
# List the packfiles of a repo as "size path" lines (du output), smallest
# first.  $SHA1 is left unquoted on purpose: it is a glob matching the 40
# hex digits of a pack name.
#
# The stderr redirect belongs to du (not to sort): when the repo has no
# packs the glob stays literal and du would otherwise complain.
packs_sizes() { # git.repo > "size pack"...
    du -s "$1"/objects/pack/pack-$SHA1.pack 2> /dev/null | sort -n
}
161
# True when the given keep file contains this script's marker ($KEEP).
# A missing or unreadable file counts as "not ours".  The marker is
# matched as a fixed string, not as a regular expression.
is_ourkeep() { grep -qF -- "$KEEP" "$1" 2> /dev/null ; } # keep
# True when the keep file belonging to the given packfile is one of ours.
has_ourkeep() { is_ourkeep "$(keep_for "$1")" ; } # pack
# True when the given packfile has a keep file next to it (ours or not).
has_keep() { [ -f "$(keep_for "$1")" ] ; } # pack
# Cheap sanity check: a directory is treated as a git repo when it has
# both an objects dir and a refs/heads dir.
is_repo() { [ -d "$1/objects" ] && [ -d "$1/refs/heads" ] ; } # git.repo
166
# Mark a packfile with our keep file unless it already has a keep file.
# Returns 0 only when our keep was actually written; returns 1 when a
# keep already exists, when the argument is not a packfile path, or when
# the keep file cannot be written.
keep() { # pack # returns true if we added our keep
    local keepfile
    keepfile=$(keep_for "$1") || return 1
    [ -f "$keepfile" ] && return 1
    echo "$KEEP" > "$keepfile" || return 1
    return 0
}
173
# Print the keep file path that belongs to a packfile path.
# A path which already ends in .keep is passed through unchanged; any
# other path is rejected with status 1 and no output.  Only the suffix is
# inspected, so ".keep" appearing elsewhere in the path does not count.
keep_for() { # packfile > keepfile
    case "$1" in
        *.pack) echo "${1%.pack}.keep" ;;
        *.keep) echo "$1" ;;
        *) return 1 ;;
    esac
}
179
# Print the index file path that belongs to a packfile path.
# A path which already ends in .idx is passed through unchanged; any
# other path is rejected with status 1 and no output.  Only the suffix is
# inspected, so ".idx" appearing elsewhere in the path does not count.
idx_for() { # packfile > idxfile
    case "$1" in
        *.pack) echo "${1%.pack}.idx" ;;
        *.idx) echo "$1" ;;
        *) return 1 ;;
    esac
}
185
# pack_or_keep_file > sha
# Strip the directory, the "pack-" prefix and the extension from a pack,
# keep or idx file name.  A name which does not look like pack-<sha>.<ext>
# is printed unchanged.
sha_for() { printf '%s\n' "$1" | sed -e 's|\(.*/\)*pack-\([^.]*\)\..*$|\2|' ; }
188
# Collect the keep files in a repo which were NOT written by this script
# into the pkeeps array.  pkeeps is deliberately not local here: it is the
# function's result and is read by the caller.
#
# When no keep file exists the glob stays literal; that literal pattern is
# skipped so that "no private keeps" really yields an empty pkeeps.
private_keeps() { # git.repo -> sets pkeeps
    local repo=$1
    local keep
    pkeeps=()
    for keep in "$repo"/objects/pack/pack-$SHA1.keep ; do
        [ -f "$keep" ] || continue
        is_ourkeep "$keep" || pkeeps=("${pkeeps[@]}" "$keep")
    done
    return 0
}
197
# True when the smaller size multiplied by RATIO still exceeds the larger
# size, i.e. the two packfiles are too close in size to be kept apart.
is_tooclose() { [ "$(($1 * $RATIO))" -gt "$2" ] ; } # smaller larger
199
# Print the distinct arguments, sorted, as one line of words.  Words are
# joined with the first character of IFS (a space by default).
# printf is used throughout so that a word such as -n is treated as data.
unique() { # [args...] > unique_words
    local sorted
    sorted=$(printf '%s\n' "$@" | sort -u)
    set -- $sorted # deliberately unquoted: one word per sorted line
    printf '%s\n' "$*"
}
205
# Join the arguments after the first with the first argument as the
# separator.  Nothing (not even a newline) is printed for an empty list.
outfs() { # fs [args...] > argfs...
    local fs=$1 ; shift
    [ $# -gt 0 ] || return 0
    printf '%s' "$1" ; shift
    while [ $# -gt 0 ] ; do printf '%s' "$fs$1" ; shift ; done
}
211
# Format the rows of a pack list into fixed width columns and sort them
# by size (column 3, numeric) and then by id (column 1, numeric).
# Everything from the eighth word on is the free text note and is kept
# together as the last column.
sort_list() { # < list > formatted_list
    # n has_keep size sha repack down up note
    awk '{ note = $8
           for (i = 9; i <= NF; i++) note = note " " $i
           printf("%-5s %s %-14s %-40s %s %s %s %s\n", \
                  $1, $2, $3, $4, $5, $6, $7, note) }' |
        sort -k 3,3n -k 1,1n
}
219
# Decide whether gc needs to run on a repo.  Returns 0 (touched) when
#   - -t was given (SW_T), or
#   - loose object directories exist (unless --noloose, SW_LOOSE), or
#   - none of the keep files is ours (we have probably never run), or
#   - something under objects, refs or packed-refs is newer than our keep.
# Returns 1 when the repo looks untouched since the last run.
is_touched() { # git.repo
    local repo=$1
    local loose keep ours newer
    [ -n "$SW_T" ] && { debug "$SW_T -> treat as touched" ; return 0 ; }

    if [ -z "$SW_LOOSE" ] ; then
        # If there are loose objects, they may need to be pruned,
        # run even if nothing has really been touched.
        # Loose object dirs are named after two hex digits, so letters
        # a-f must match too.  Only direct children of objects are
        # inspected and the search stops at the first hit.
        loose=$(find "$repo/objects" -mindepth 1 -maxdepth 1 -type d \
                     -name '[0-9a-f][0-9a-f]' \
                     -print -quit 2>/dev/null)
        [ -n "$loose" ] && { info "There are loose object directories" ; return 0 ; }
    fi

    # If we don't have a keep, the current packfiles may not have been
    # compressed with the current gc policy (gc may never have been run),
    # so run at least once to repack everything.  Also, we need a marker
    # file for timestamp tracking (a dir needs to detect changes within
    # it, so it cannot be a marker) and our keeps are something we control,
    # use them.
    for keep in "$repo"/objects/pack/pack-$SHA1.keep ; do
        is_ourkeep "$keep" && { ours=$keep ; break ; }
    done
    [ -z "$ours" ] && { info 'We have no keep (we have never run?): run' ; return 0 ; }

    debug "Our timestamp keep: $ours"
    # The wholename stuff seems to get touched by a noop git gc
    newer=$(find "$repo/objects" "$repo/refs" "$repo/packed-refs" \
                 '!' -wholename "$repo/objects/info" \
                 '!' -wholename "$repo/objects/info/*" \
                 -newer "$ours" \
                 -print -quit 2>/dev/null)
    [ -z "$newer" ] && return 1

    info "Touched since last run: $newer"
    return 0
}
257
# Set the modification time of the listed refs (one path per line,
# relative to <repo>/refs) to start_date.  Refs which no longer exist are
# not created (touch -c).  Runs in a subshell so that the cd does not
# leak; an empty list is a no-op.
touch_refs() { # git.repo start_date refs
    local repo=$1 start_date=$2 refs=$3
    (
        debug "Setting start date($start_date) on unpacked refs:"
        debug "$refs"
        [ -n "$refs" ] || exit 0
        cd "$repo/refs" || exit
        # safe to assume no newlines in a ref name
        printf '%s\n' "$refs" | xargs -d '\n' -n 1 touch -c -d "$start_date" --
    )
}
268
# Reset the timestamps of everything a (possibly noop) gc run touches to
# start_date, so that the next run can use our keep files as a "nothing
# changed since" marker.
#   refs, refdirs, packedrefs are the snapshots taken before gc ran; the
#   remaining arguments are the packfiles present after gc.
# The ref handling is skipped when --norefs (SW_REFS) is set.
set_start_date() { # git.repo start_date refs refdirs packedrefs [packs]
    local repo=$1 start_date=$2 refs=$3 refdirs=$4 packedrefs=$5 ; shift 5
    local pack keep idx repacked

    # This stuff is touched during object packs
    while [ $# -gt 0 ] ; do
        pack=$1 ; shift
        keep="$(keep_for "$pack")"
        idx="$(idx_for "$pack")"
        touch -c -d "$start_date" "$pack" "$keep" "$idx"
        debug "Setting start date on: $pack $keep $idx"
    done
    # This will prevent us from detecting any deletes in the pack dir
    # since gc ran, except for private keeps which we are checking
    # manually.  But there really shouldn't be any other relevant deletes
    # in this dir which should cause us to rerun next time, deleting a
    # pack or index file by anything but gc would be bad!
    debug "Setting start date on pack dir: $start_date"
    touch -c -d "$start_date" "$repo/objects/pack"


    if [ -z "$SW_REFS" ] ; then
        # The trailing backslash matters: without it -print -quit and the
        # stderr redirect would not belong to find at all.
        repacked=$(find "$repo/packed-refs" -newer "$repo/objects/pack" \
                        -print -quit 2>/dev/null)
        if [ -n "$repacked" ] ; then
            # The ref dirs and packed-ref files seem to get touched even on
            # a noop refpacking
            debug "Setting start date on packed-refs"
            touch -c -d "$start_date" "$repo/packed-refs"
            touch_refs "$repo" "$start_date" "$refdirs"

            # A ref repack does not imply a ref change, but since it is
            # hard to tell, simply assume so
            if [ "$refs" != "$(cd "$repo/refs" ; find -depth)" ] || \
               [ "$packedrefs" != "$(<"$repo/packed-refs")" ] ; then
                # We retouch if needed (instead of simply checking then
                # touching) to avoid a race between the check and the set.
                debug "  but refs actually got packed, so retouch packed-refs"
                touch -c "$repo/packed-refs"
            fi
        fi
    fi
}
312
# Add an id to the "Consolidate with:<ids>" part of a note.  An existing
# list is merged with the new id (sorted, without duplicates) and moved to
# the end of the note; the rest of the note is kept untouched, including
# any commas it contains.
note_consolidate() { # note entry > note (no duplicated consolidated entries)
    local note=$1 entry=$2
    local previous='' entries=()
    if echo "$note" | grep -q 'Consolidate with:[0-9,c]' ; then
        previous=$(echo "$note" | sed -e 's/^.*Consolidate with:\([0-9,c]*\).*$/\1/')
        note=$(echo "$note" | sed -e 's/Consolidate with:[0-9,c]*//')
    fi
    # ${previous//,/ } is unquoted on purpose: one word per listed id
    entries=( $(unique ${previous//,/ } "$entry") )
    echo "$note Consolidate with:$(outfs , "${entries[@]}")"
}
325
# Add an id to the "(too far from:<ids>)" part of a note.  An existing
# list is merged with the new id (sorted, without duplicates) and moved to
# the end of the note; the rest of the note is kept untouched, including
# any commas it contains.
note_toofar() { # note entry > note (no duplicated "too far" entries)
    local note=$1 entry=$2
    local previous='' entries=()
    if echo "$note" | grep -q '(too far from:[0-9,c]*)' ; then
        previous=$(echo "$note" | sed -e 's/^.*(too far from:\([0-9,c]*\)).*$/\1/')
        note=$(echo "$note" | sed -e 's/(too far from:[0-9,c]*)//')
    fi
    # ${previous//,/ } is unquoted on purpose: one word per listed id
    entries=( $(unique ${previous//,/ } "$entry") )
    echo "$note (too far from:$(outfs , "${entries[@]}"))"
}
338
# Print the delayed row of the previous entry, if there is one:
# the "repack" variant when the following entry consolidated with it
# (size_hit non-empty), the plain variant otherwise.
# Only the three arguments are used; nothing is read from the caller.
last_entry() { # isRepack pline repackline > last_rows_entry
    local size_hit=$1 pline=$2 repackline=$3
    [ -n "$pline" ] || return 0
    if [ -n "$size_hit" ] ; then
        echo "$repackline"
    else
        echo "$pline"
    fi
}
349
# Build the initial pack list of a repo: one row per packfile in the
# order given by packs_sizes, numbered from 1, with
#   keep column: - (none), k (private keep) or o (our keep)
#   repack column: n (nothing decided yet)
# The rows are formatted and sorted by sort_list.
init_list() { # git.repo > shortlist
    local repo=$1
    local file
    local n=0 has_keep size sha repack

    packs_sizes "$repo" | {
        # -r: keep backslashes in paths as they are
        while read -r size file ; do
            n=$((n+1))
            repack=n
            has_keep=-
            if has_keep "$file" ; then
                has_keep=k
                has_ourkeep "$file" && has_keep=o
            fi
            sha=$(sha_for "$file")
            echo "$n $has_keep $size $sha $repack"
        done
    } | sort_list
}
369
370consolidate_list() { # run < list > list
371 local run=$1
372 local sum=0 psize=0 sum_size=0 size_hit pn clist pline repackline
373 local n has_keep size sha repack down up note
374
375 {
376 while read n has_keep size sha repack down up note; do
377 [ -z "$up" ] && up='-'
378 [ -z "$down" ] && down="-"
379
380 if [ "$has_keep" = "k" ] ; then
381 echo "$n $has_keep $size $sha $repack - - Private"
382 continue
383 fi
384
385 if [ "$repack" = "n" ] ; then
386 if is_tooclose $psize $size ; then
387 size_hit=y
388 repack=y
389 sum=$(($sum + $sum_size + $size))
390 sum_size=0 # Prevents double summing this entry
391 clist=($(unique "${clist[@]}" $pn $n))
392 down="^"
393 [ "$has_keep" = "-" ] && note="$note New +"
394 note=$(note_consolidate "$note" "$pn")
395 elif [ "$has_keep" = "-" ] ; then
396 repack=y
397 sum=$(($sum + $size))
398 sum_size=0 # Prevents double summing this entry
399 clist=($(unique "${clist[@]}" $n))
400 note="$note New"
401 elif [ $psize -ne 0 ] ; then
402 sum_size=$size
403 down="!"
404 note=$(note_toofar "$note" "$pn")
405 else
406 sum_size=$size
407 fi
408 else
409 sum_size=$size
410 fi
411
412 # By preventing "c files" (consolidated) from being marked
413 # "repack" they won't get keeps
414 repack2=y
415 [ "${n/c}" != "$n" ] && { repack=- ; repack2=- ; }
416
417 last_entry "$size_hit" "$pline" "$repack_line"
418 # Delay the printout until we know whether we are
419 # being consolidated with the entry following us
420 # (we won't know until the next iteration).
421 # size_hit is used to determine which of the lines
422 # below will actually get printed above on the next
423 # iteration.
424 pline="$n $has_keep $size $sha $repack $down $up $note"
425 repack_line="$n $has_keep $size $sha $repack2 $down v $note"
426
427 pn=$n ; psize=$size # previous entry data
428 size_hit='' # will not be consolidated up
429
430 done
431 last_entry "$size_hit" "$pline" "$repack_line"
432
433 [ $sum -gt 0 ] && echo "c$run - $sum [$(outfs , "${clist[@]}")] - - -"
434
435 } | sort_list
436}
437
# Plan the repack of a repo: start from init_list and rerun
# consolidate_list (with an increasing run number) until a pass no longer
# changes the list.  Each changed intermediate list is shown in verbose
# mode; the final list is printed on stdout.
process_list() { # git.repo > list
    local list plist run=0
    local rule="------------------------------------------------------------------------------------"
    list=$(init_list "$1")

    while true ; do
        plist=$list
        run=$((run + 1))
        list=$(echo "$list" | consolidate_list "$run")
        [ "$plist" = "$list" ] && break
        debug "$rule"
        debug "$HEADER"
        debug "$list"
    done
    debug "$rule"
    echo "$list"
}
456
# Execute a planned list (read from stdin) against a repo:
#   1. snapshot refs and private keeps
#   2. remove the keep of every row marked for repacking (repack = y)
#   3. run git gc with GC_OPTS
#   4. put our keep on every packfile which has none
#   5. reset the timestamps to the start date, but only when at most one
#      new pack appeared and the private keeps did not change during gc
# Returns the exit status of git gc.
repack_list() { # git.repo < list
    local repo=$1
    local start_date newpacks=0 pkeeps keeps=1 refs refdirs rtn
    local packedrefs=''
    local n has_keep size sha repack down up note keep pack packs

    # A repo without packed refs is fine, just remember "nothing"
    [ -f "$repo/packed-refs" ] && packedrefs=$(<"$repo/packed-refs")

    # so they don't appear touched after a noop refpacking
    if [ -z "$SW_REFS" ] ; then
        refs=$(cd "$repo/refs" && find . -depth)
        # -depth goes before -type: it is a global find option
        refdirs=$(cd "$repo/refs" && find . -depth -type d)
        debug "Before refs:"
        debug "$refs"
    fi

    # Find a private keep snapshot which has not changed from
    # before our start_date so private keep deletions during gc
    # can be detected
    while ! array_equals pkeeps "${keeps[@]}" ; do
        debug "Getting a private keep snapshot"
        private_keeps "$repo"
        keeps=("${pkeeps[@]}")
        debug "before keeps: ${keeps[*]}"
        start_date=$(date)
        private_keeps "$repo"
        debug "after keeps: ${pkeeps[*]}"
    done

    while read -r n has_keep size sha repack down up note; do
        if [ "$repack" = "y" ] ; then
            keep="$repo/objects/pack/pack-$sha.keep"
            info "Repacking $repo/objects/pack/pack-$sha.pack"
            [ -f "$keep" ] && rm -f "$keep"
        fi
    done

    ( cd "$repo" && git gc "${GC_OPTS[@]}" ) ; rtn=$?

    # Mark any files without a .keep with our .keep
    packs=("$repo"/objects/pack/pack-$SHA1.pack)
    for pack in "${packs[@]}" ; do
        if keep "$pack" ; then
            info "New pack: $pack"
            newpacks=$((newpacks+1))
        fi
    done

    # Record start_time.  If there is more than 1 new packfile, we
    # don't want to risk touching it with an older date since that
    # would prevent consolidation on the next run.  If the private
    # keeps have changed, then we should run next time no matter what.
    # The private keeps are listed again here: "keeps" still holds the
    # snapshot from before gc, pkeeps now holds the state after it.
    private_keeps "$repo"
    if [ "$newpacks" -le 1 ] && array_equals pkeeps "${keeps[@]}" ; then
        set_start_date "$repo" "$start_date" "$refs" "$refdirs" "$packedrefs" "${packs[@]}"
    fi

    return $rtn # we really only care about the gc error code
}
512
# Plan and run the garbage collection of one repo.  Unless verbose mode is
# on (in which case process_list already showed every planning pass) the
# final plan is printed on stderr first.  Returns the status of
# repack_list.
git_gc() { # git.repo
    local list
    list=$(process_list "$1")
    if [ -z "$SW_V" ] ; then
        # [*] keeps the message a single word so that every gc option is
        # shown, not only the first one
        info "Running $PROG on $1. git gc options: ${GC_OPTS[*]}"
        echo "$HEADER" >&2
        echo "$list" >&2
    fi
    echo "$list" | repack_list "$1"
}
522
523
# Script wide constants.
PROG=$(basename "$0")
# Column titles printed above a pack list
HEADER="Id Keep Size Sha1(or consolidation list) Actions(repack down up note)"
# Content written to the keep files owned by this script
KEEP=git-exproll
# Glob matching the 40 hex digits of a packfile name, built up from the
# glob for a single hex digit
HEX='[0-9a-f]'
HEX10=$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX$HEX
SHA1=$HEX10$HEX10$HEX10$HEX10
530
# Option parsing.  Every switch remembers the option text it was set by
# (empty means "not given").  The last argument is always left in place as
# the repo; every other unknown argument is collected for git gc.
RATIO=10
SW_N='' ; SW_V='' ; SW_T='' ; SW_REFS='' ; SW_LOOSE='' ; GC_OPTS=()
while [ $# -gt 0 ] ; do
    case "$1" in
        -u|-h) usage ;;
        -n) SW_N="$1" ;;
        -v) SW_V="$1" ;;

        -t) SW_T="$1" ;;
        # --noref is accepted too: that is how older help text spelled it
        --norefs|--noref) SW_REFS="$1" ;;
        --noloose) SW_LOOSE="$1" ;;

        -r|--ratio)
            [ $# -ge 2 ] || usage "$1 requires a number"
            shift
            # the ratio is used in integer shell arithmetic
            case "$1" in
                ''|*[!0-9]*) usage "ratio must be a whole number: $1" ;;
            esac
            RATIO="$1" ;;

        *) [ $# -le 1 ] && break
           GC_OPTS=( "${GC_OPTS[@]}" "$1" )
           ;;
    esac
    shift
done
551
552
# The remaining argument is the repo: either a bare repo, or a work tree
# whose .git directory is used instead.
REPO="$1"
if ! is_repo "$REPO" ; then
    REPO=$REPO/.git
    is_repo "$REPO" || usage "($1) is not likely a git repo"
fi


if [ -n "$SW_N" ] ; then
    # dry run: only show the plan, even for an untouched repo
    is_touched "$REPO" || info "Repo untouched since last run, analyze anyway."
    process_list "$REPO" >&2
else
    is_touched "$REPO" || { info "Repo untouched since last run" ; exit ; }
    git_gc "$REPO"
fi