Added original farce-002 scripts.

2006-04-09 08:49:42 +00:00 · 2006-04-09 08:49:42 +00:00 · dc526eaef0
commit dc526eaef0
parent 11869f1ed7
2 changed files with 1035 additions and 0 deletions
--- a/extras/farce
+++ b/extras/farce
@ -0,0 +1,874 @@
+#!/bin/bash
+
+# FARCE: Farce Assists Rebuild Comparison Evaluation ;)
+#
+# to answer the question "can it rebuild itself?"
+#
+#  We expect four arguments - first directory path, filelist
+# containing the files in this directory which we wish to compare,
+# second directory path, filelist for second directory.
+#
+# Yes, we could just compare everything in each tree, but the
+# filelist script knows about files it can reasonably ignore,
+# and this also allows us to build a sytem, boot it and get a
+# list of files, build a full desktop environment, and only then
+# build and boot the "can it build itself" test system and get
+# _its_ filelist. 
+#
+# What this script aims to do:
+# ____________________________
+#
+# First, report files not in both builds.
+#
+# Then, confirm symlinks point to same targets.
+#
+# After that, compare individual files -
+#  if different, run the file name through 'expected'
+# to pick out files that are unlikely to match (logs,
+# pids, fstab [assumes '/' is a different device each time],
+# count these as 'expected'.
+#
+# For whatever is left, check the file type - ar archives
+# have their members extraced and compared (every member has
+# a timestamp), gzipped files are compared beyond their
+# timestamp, binaries, at least those using shared libs or
+# which are shared objects, are copied and subjected to
+# --strip-debug.  If files match at this stage, count them as
+# 'accepted'. 
+#
+# As a last step for any file that doesn't match, copy it
+# through some perl regexps to "process" it (convert any
+# date, time, kernel-version information from standard formats
+# into tokens, then see if the tokensi match.
+#
+# For details of the regexps, see the tokenize function.
+# Those files that match after this are also counted as
+# 'accepted'.  Note that I don't always start from the kernel
+# version that I'm going to build, so this copes with e.g. perl
+# files that hardcode the kernel version.
+#
+# We now have files that don't match.  A few of these seem to be
+# common to all builds - some (members of) c++ libraries or ar
+# archives, a few programs which perhaps use some sort of c++ code).
+# The file name # is passed to the 'failure' function - these
+# recognized filenames are labelled as 'predictable FAIL:',
+# anything else is labelled as 'unexpected FAIL:'.
+#
+# output:
+# stderr - files only in one of the builds, failure messages,
+#  and totals.
+#
+# farce-results - more details, including which files were treated
+# as expected differences, files where neither copy could be read,
+# files treated as accepted, with the reason (and member for ar
+# archives).  This data is typically up to 100 characters wide -
+# sometimes it's a bit more, but it doesn't wrap too badly in a
+# 100 character xterm.
+#
+# farce-extras - diffs for the files, or members, that didn't
+# match.  This file is to establish new regexps for picking up
+# date/time/kernel-version formats.
+#
+# farce-identical - the names of the files which are identical
+# 
+# farce-substitutions - whenever using tokenizeanddiff results in a
+# difference being accepted, for both versions diff the before and
+# after versions to show what got changed.  If the file is a binary,
+# the output may still be hard to read.  Note that I _know_ glibc
+# version strings pass one of the regexps looking for a kernel version
+# - since I expect you to use the same version of glibc for each
+# build, this is not a problem.     
+#
+# farce-differ - the names of the files which could not be treated
+# as matching (whether or not I regard the failure as predictable)
+# for possible input to ICA processing.
+#  
+# Copyright (C) 2005, 2006 Ken Moffat <ken@linuxfromscratch.org>
+#
+# All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+# NON INFRINGEMENT.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+VERSION="002"
+
+# variables for output files
+RESULT=farce-results
+EXTRAS=farce-extras
+IDENTICAL=farce-identical
+SUBS=farce-substitutions
+DIFFER=farce-differ
+
+# documenting the variables
+# C1, C2	temp files to hold disassembled code
+# D1, D2	temp directories for extracting member of ar archive
+# DIFF		temp file for diffing the filelists
+# F1, F2	temp files for tokenizeanddiff
+# FTYPE		obsolete, commented out
+# M1, M2	temp files to hold members of an ar archive
+# MEMBERS	temp file to list members of ar archive
+# OP1, OP2	the original $PWD, needed when extracting members of ar
+#		archives
+# P1, P2	paths to first and second build
+# S1, S2	temp files for shared objects
+
+# functions
+function dohelp() {
+	echo "`basename $0`: compare trees of files from a build"
+	echo "and lists of the files they contained"
+	echo ""
+	echo "`basename $0` [ -help | -version ] || path1 list1 path2 list2"
+}
+
+function emessage() {
+	# write a string to both stderr and $RESULT
+	echo "$@" >&2
+	echo "$@" >&5
+}
+
+function expected() {
+	# if we expect it to differ because of its name,
+	# allow it and report, return true ; else return false
+	case $1 in
+	/boot/grub/menu.lst)
+		# just in case somebody puts this into the main filesystem
+		true;;
+	/etc/aliases.db)
+		# some sort of database for postfix, not parsable
+		true;;
+	/etc/blkid.tab)
+		# includes dev name for rootfs
+		true;;
+	/etc/fstab)
+		# fstab, e.g. ' / ' will differ
+		true;;
+	/etc/group*)
+		true;;
+	/etc/hosts)
+		# with dhcp client, I add current ip address to this in a hook
+		true;;
+	/etc/ld.so.*)
+		# .conf and .cache can vary,
+		# particularly if one system has a full build when I run this
+		true;;
+	/etc/lilo.conf|/etc/yaboot.conf)
+		# bootloader control, I assume grub will all be on a separate
+		true;;
+	/etc/mtab)
+		# at a minimum, different '/'
+		true;;
+	/etc/ntp.drift)
+		true;;
+	/etc/passwd*)
+		true;;
+	/etc/shadow*)
+		true;;
+	/etc/ssh/*key|/etc/ssh/*pub)
+		# openssh keys
+		true;;
+	/misc/*)
+		# where I put buildscripts (which mostly won't change)
+		# and stamps containing name/time/space which will differ in the times
+		true;;
+	/root/*)
+		# expect .bash_history etc to differ - if we can read them
+		true;;
+	/usr/bin/lynx)
+		# part of my inital builds, I guess this uses anonymous namespaces
+		true;;
+	/usr/include/c++/*/*/bits/stdc++.h.gch/*)
+		# precompiled headers
+		true;;
+	/usr/lib*/libstdc++.a|/usr/lib*/libstdc++.so*|/usr/lib*/libsupc++.a)
+		# probably, anonymous namespaces
+		# libstdc++.a, libstdc++.so.n.n.n, libsupc++.a
+		true;;
+	/usr/share/info/dir)
+		# if one system has had extra stuff built, this will likely be bigger
+		true;;
+	/usr/share/man/whatis)
+		# if one system has had extra stuff built, this will likely be bigger
+		true;;
+	/var/lib/locate/locatedb)
+		# if one system has had extra stuff built, this will likely be bigger 
+		true;;
+	/var/lib/nfs/*)
+		# allow nfs bookkeeping
+		true;;
+	/var/log/*)
+		true;;
+	/var/run/utmp)
+		true;;
+	/var/spool/fcron*)
+		true;;
+	/var/state/*)
+		# allow dhcp leases
+		true;;
+	/var/tmp/random-seed)
+		true;;
+	# following start with wildcards
+	*Image*|*.PPCBoot*|*vmlinuz*|*lfskernel*)
+		# compressed kernels, sometimes just building at a different
+		# date/time is enough to change the length of them, because the
+		# long format date and time is part of the compressed data
+		true;;
+	*pid*)
+		# pids, including e.g. /var/spool/postfix/pid/*
+		true;;
+	*)
+		# nothing else is expected to be different
+		false;;
+	esac
+	if [ $? -eq 0 ]; then
+		message "expected difference in $1"
+		let expected=$expected+1
+		case $TYPE in
+			AR)
+				let EXPAR=$EXPAR+1
+				;;
+			ELF)
+				let EXPELF=$EXPELF+1
+				;;
+			UNK)
+				let EXPUNK=$EXPUNK+1
+				;;
+			# so far, no other valid types, so don't accumulate them
+			*)
+				emessage "internal error, expected difference for $1 of type $TYPE not allowed"
+				exit 2
+				;; 
+		esac
+		true
+	else
+		false
+	fi
+}
+
+function failure() {
+	# first parm is filename or token
+	# second parm is the error message
+	# update the appropriate total
+	# and write to both stderr and the results
+	# by using emessage
+
+	let different=$different+1
+	case $TYPE in
+		AR)
+			let DIFAR=$DIFAR+1
+			;;
+		ELF)
+			let DIFELF=$DIFELF+1
+			;;
+		GZ)
+			let DIFGZ=$DIFGZ+1
+			;;
+		SYM)
+			let DIFSYM=$DIFSYM+1
+			;;
+		UNK)
+			let DIFUNK=$DIFUNK+1
+			;;
+		*)
+			emessage "internal error in failure() for TYPE $TYPE"
+			exit 2
+			;;
+		esac
+		test -f ${P1}$1 && echo $1 >&9
+		emessage "FAIL: $2"
+}
+
+function fatal() {
+	# unrecoverable error
+	echo $*
+	exit 1
+}
+
+function filetype() {
+	TYPE=`file ${P1}${FILE}`
+	case $TYPE in
+	*'current ar archive'*)
+		let TOTAR=$TOTAR+1
+		TYPE=AR
+		;;
+	*' ELF '*)
+		let TOTELF=$TOTELF+1
+		TYPE=ELF
+		;;
+	*'gzip compressed data'*)
+		let TOTGZ=$TOTGZ+1
+		TYPE=GZ
+		;;
+	*)
+		let TOTUNK=$TOTUNK+1
+		TYPE=UNK
+		;;
+	esac
+}
+
+function message() {
+	# write a string to $RESULT
+	echo $* >&5
+}
+
+function onlyone() {
+	#report files only in one build
+	# text should go to both stderr and the results,
+	# but blank lines only go to the results
+	if [ $1 == '<' ]; then
+		emessage "File(s) only in the first build"
+	else
+		emessage "File(s) only in the second build"
+	fi
+	message ""
+	FILES=`cat $DIFF | grep "^$1" | cut -d ' ' -f 2`
+	for F in $FILES; do
+		emessage $F
+		let only=$only+1
+	done
+	message ""
+}
+
+# 'test' functions are called with three arguments:
+# the two pathes and the filename
+# - we know the file is of this type, so see if we
+# can get it to match by reasonalbe means.
+# if not, treat it as different.
+#
+# NB if pathes are absolute, we need to prefix them
+# with the original $PWD to access the .a files
+# 
+function testar() {
+	# ar archives include timestamps for the members,
+	# but diff doesn't show file timestamps unless the data differs
+	# put out a message to help locate which archive any messages
+	# about the members refer to.
+	
+	# try just stripping them U1,2 undebuggable
+	U1=`mktemp` || fatal "cannot create a temporary file"
+	U2=`mktemp` || fatal "cannot create a temporary file"
+	cp ${1}${3} $U1
+	cp ${2}${3} $U2
+	strip --strip-debug $U1
+	strip --strip-debug $U2
+	cmp -s $U1 $U2
+	rm $U1 $U2
+	if [ $? -eq 0 ]; then
+		let accepted=$accepted+1
+		let ACCAR=$ACCAR+1
+		message "archive $3 matches after strip --strip-debug"
+		return 
+	fi
+	# rest of this function retained primarily for pathologically bad builds
+	# put out a message in the log to help identify which archive has issues.
+	message "examining ar archive $3"
+	D1=`mktemp -d` || fatal "cannot create a temporary directory"
+	D2=`mktemp -d` || fatal "cannot create a temporary directory"
+	cd $D1
+	ar -x ${OP1}${1}${3} 
+	cd $D2
+	ar -x ${OP2}${2}${3}
+	cd
+	# diff the members - true means they match
+	diff -Na $D1 $D2 >/dev/null
+	if [ $? -eq 0 ]; then
+		message "accept: $3 after diffing the members"
+		let accepted=$accepted+1
+		let ACCAR=$ACCAR+1
+	else
+		# process individual members to eliminate date/time/kernel-version
+		# first, check the members are the same
+		M1=`mktemp` || fatal "cannot create a temporary file"
+		M2=`mktemp` || fatal "cannot create a temporary file"
+		cd $D1
+		MEMBERS=
+		for F in *; do
+			MEMBERS="$MEMBERS $F"
+		done
+		cd
+		echo $MEMBERS | sort >$M1
+		cd $D2
+		MEMBERS=
+		for F in *; do
+			MEMBERS="$MEMBERS $F"
+		done
+		cd
+		echo $MEMBERS | sort >$M2
+		cmp -s $M1 $M2
+		if [ $? -ne 0 ]; then
+			# oh dear, different members
+			echo "list of members differs for archive $3" >&6
+			diff $M1 $M2 >&6
+			failure $3 "$3 list of members differs"
+		else
+			# members (names) are same,
+			# process each one
+			STATUS=0
+			for M in $MEMBERS; do
+				#avoid firing up perl on matching members
+				cmp -s $D1/$M $D2/$M
+				if [ $? -ne 0 ]; then
+					tokenizeanddiff $D1/$M $D2/$M $FILE:$M
+					if [ $? -eq 0 ]; then
+						message "member $M matches after processing"
+					else
+						message "member $M DIFFERS after processing"
+						STATUS=1
+					fi
+				fi
+			done
+			if [ $STATUS -eq 0 ]; then
+				let accepted=$accepted+1
+				let ACCAR=$ACCAR+1
+			else
+				let different=$different+1
+				let DIFAR=$DIFAR+1
+				echo $3 >&9
+				emessage "FAIL: in $3"
+			fi
+		fi
+		rm $M1 $M2
+	fi
+	rm -rf $D1 $D2
+}
+
+function testgzip() {
+	# bytes 4,5,6,7 are the timestamp, so ignore these
+	cmp -s -i 8 ${1}${3} ${2}${3}
+	if [ $? -eq 0 ]; then
+		message "accept: $3 after ignoring gzip timestamp"
+		let accepted=$accepted+1
+		let ACCGZ=$ACCGZ+1
+	else
+		failure $3 " $3 even after ignoring gzip timestamp"
+	fi
+}
+
+function testso() {
+	# shared object - first try stripping it
+	# in fact, this now handles ALL ELF files
+	S1=`mktemp` || fatal "cannot create a temporary file"
+	S2=`mktemp` || fatal "cannot create a temporary file"
+	cp ${1}${3} $S1
+	strip --strip-debug $S1
+	cp ${2}${3} $S2
+	strip --strip-debug $S2
+	cmp -s $S1 $S2
+	if [ $? -eq 0 ]; then
+		message "accept: $3 after --strip-debug"
+		let accepted=$accepted+1
+		let ACCELF=$ACCELF+1
+	else
+		tokenizeanddiff $S1 $S2 $3
+		if [ $? -ne 0 ]; then
+			failure $3 " $3 differs after stripping and processing"
+		else
+			message "accept: $3 after --strip-debug and processing"
+			let accepted=$accepted+1
+			let ACCELF=$ACCELF+1
+		fi
+	fi
+	rm $S1 $S2
+}
+
+function tokenize() {
+	# use regexes to replace date/time/kernel-version text
+	# with tokens which may allow files to match even though
+	# they have hardcoded date/time/kernel-version.
+	# arguments are file to process, and where to put it.
+	# these regexes are somewhat long, and the order they
+	# are applied in is important (to stop short ones being
+	# used when a longer version would match).
+	# KV00 linux version date (e.g. as in the kernel itself)
+	# allow 2 or 3 groups of three alphas here - optional smp,  with day, mon
+	# KV01 kernel version, including possible cpu details (that is for cdda2wav)
+	# KV02 just the version, in quotes e.g. "2.6.12.6" or '2.6.13', for perl stuff
+	# except that "|' gives me grif, so try a boundary
+	# also, it might need local version on the end, I really want
+	# quote2.\d+.\d+.{0,32}quote - it is the quotes that don't work.
+	# DT00 Day Mon .d+ hh:mm:ss TZN CCYY variations include non-caps and 'mon d'
+	# DT01 Mon .d+ CCYY hh:mm:ss
+	# DT02 hh:mm:ss Mon .d CCYY
+	# DT03 Mon .d CCYY
+	# DT04 Day Mon { ,d}d hh:mm:ss CCYY - for groff example postscript files
+	# (somewhat similar to DT00, but '  d' or ' dd' for day of month and no TZN )
+	# DT05 hh:mm:ss
+	# DT06 ISO date using space as separator
+	# DT07 ISO date using dash as separator
+	# DT08 ISO date using slash as separator
+	# DT09 fullmonth (capitalised), day number, comma, 4-digit year (groff 1.18.1 ps)
+	# DT10 dd, fullmonth (capitalised), 4-digit year (groff 1.18.1 manpages)
+	# DT11 '(xample comma space digit(s) backslash ) in groff memef.ps which is
+	# quite clearly the day of the month when it was compiled, preceded by 'example'
+	# with something weird between the e and the x.
+
+	if [ $# -ne 2 ]; then
+		fatal "tokenizing called with $# arguments : $*"
+	fi
+
+	cat $1 | perl -p \
+	 -e 's/(L|l)inux.*\d\.\d\.\d+.* \#\d+( [A-Za-z][a-z]{2}){2,3} \d+ \d\d:\d\d:\d\d [A-Za-z]{3} \d{4}\b/%KV00%/g;' \
+	 -e 's/(L|l)inux( (\w|_)+)?(-| |_)\d\.\d(\.\d+){1,2}((-|_)?(\w|_)+)?( |\000)*/%KV01%/g;' \
+	 -e 's/\W2(\.\d+){2,3}(-|_)?((\w|_)+)?\s*\W/%KV02%/g;' \
+	 -e 's/\b([A-Za-z][a-z]{2} ){2}( |\d)?\d \d\d:\d\d:\d\d [A-Za-z]{3} \d{4}\b/%DT00%/g;' \
+	 -e 's/\b[A-Z][a-z]{2} ( |\d)\d \d{4} \d\d:\d\d:\d\d\b/%DT01%/g;' \
+	 -e 's/\b\d\d:\d\d:\d\d [A-Z][a-z]{2} ( |\d)\d \d{4}\b/%DT02%/g;' \
+	 -e 's/\b[A-Z][a-z]{2} ( |\d)\d \d{4}\b/%DT03%/g;' \
+	 -e 's/\b([A-Z][a-z]{2} ){2}( |\d)\d \d\d:\d\d:\d\d \d{4}/%DT04%/g;' \
+	 -e 's/\b\d\d:\d\d:\d\d\b/%DT05%/g;' \
+	 -e 's/\b\d{4} \d\d \d\d\b/%DT06%/g;' \
+	 -e 's/\b\d{4}-\d\d-\d\d\b/%DT07%/g;' \
+	 -e 's/\b\d{4}\/\d\d\/\d\d\b/%DT08%/g;' \
+	 -e 's/\b[A-Z][a-z]{2,} \d{1,2}, \d{4}/%DT09%/g;' \
+	 -e 's/\b\d\d [A-Z][a-z]{2,} \d{4}/%DT10%/g;' \
+	 -e 's/\(xample, \d{1,2}\\\)/%DT11%/g;' \
+	>$2
+}
+
+function tokenizeanddiff() {
+	# Call tokenize for the inputs, then compare the results
+	# Input arguments are path/filename for old and new versions
+	# third parm is readable name (filename, or archivename:member)
+	# to help understand what is in the extras output.
+	#  - sometimes called for files, but other times called for
+	# members of ar archives extracted into temporary directories
+	#message tokenizeanddiff called for $1 $2 $3
+	F1=`mktemp` || fatal "cannot create a temporary file"
+	F2=`mktemp` || fatal "cannot create a temporary file"
+	tokenize $1 $F1
+	tokenize $2 $F2
+
+	# actually, cmp is probably more efficient 
+	# but for picking up the pieces it will be better to
+	# use diff to see what got through.
+	cmp -s $F1 $F2
+	TOKENRESULT=$?	
+	if [ $TOKENRESULT -ne 0 ]; then
+		echo "failure in $3..." >&6
+		diff -a $F1 $F2 >&6
+		rm $F1 $F2
+		false
+	else
+		# show what we did
+		echo "substitutions for $3" >&8
+		echo "build one" >&8
+		diff -a $1 $F1 >&8
+		echo "build two" >&8
+		diff -a $2 $F2 >&8
+		rm $F1 $F2
+		true
+	fi
+}
+
+function validateargs() {
+# validate the arguments
+BAD=0
+if ! [ -d $1 ]; then
+	echo "Error: first argument is not a directory" >&2
+	let BAD=$BAD+1
+fi
+NAME=`basename ${2%%-*}`
+if [ $NAME != filelist ]; then
+	echo "Error: second argument is not a recognized filelist" >&2
+	let BAD=$BAD+1
+fi
+if ! [ -d $3 ]; then
+	echo "Error: third argument is not a directory" >&2
+	let BAD=$BAD+1
+fi
+NAME=`basename ${4%%-*}`
+if [ $NAME != filelist ]; then
+	echo "Error: fourth argument is not a recognized filelist" >&2
+	let BAD=$BAD+1
+fi
+for I in $1 $2 $3 $4; do
+	if ! [ -r $I ]; then
+		echo "Error: cannot read $I" >&2
+		let BAD=$BAD+1
+	fi
+done
+if [ $1 == $3 ]; then
+	echo "Error: directory pathes are identical" >&2
+	let BAD=$BAD+1
+fi
+if [ $2 == $4 ]; then
+	echo "Error: filelist names are identical" >&2
+	let BAD=$BAD+1
+fi
+if [ $BAD -eq 0 ]; then
+	ARGS=valid
+fi
+}
+
+# Mainline
+ARGS=unproven
+OUTDIR=
+if [ $# -eq 1 ]; then
+	case $1 in
+		-version|--version)
+			echo "`basename $0` version $VERSION"
+			exit 0
+			;;
+		-help|--help)
+			dohelp
+			exit 0
+			;;
+	esac
+fi
+if [ $1 = "--directory" ]; then
+	OUTDIR=$2
+	shift 2
+	grep '/$' $OUTDIR >/dev/null 2>&1 || OUTDIR=`echo $OUTDIR | sed 's%$%/%'`
+	echo "creating directory $OUTDIR"
+	mkdir -p $OUTDIR
+	if [ $? -ne 0 ]; then
+		echo "cannot mkdir $OUTDIR"
+		exit 1
+	fi
+fi
+if [ $# -eq 4 ]; then
+	validateargs $*
+fi
+if ! [ $ARGS == valid ]; then
+	dohelp
+	fatal "`basename $0`: error in arguments"	
+fi
+
+# ok, we're happy, lets hit these files
+exec 5>${OUTDIR}$RESULT
+exec 6>${OUTDIR}$EXTRAS
+exec 7>${OUTDIR}$IDENTICAL
+exec 8>${OUTDIR}$SUBS
+exec 9>${OUTDIR}$DIFFER
+
+>${OUTDIR}$RESULT
+if [ $? -ne 0 ]; then
+	fatal "cannot write to ${OUTDIR}$RESULT"
+fi
+
+emessage "will compare:"
+emessage " first build at $1 with files listed in $2"
+emessage "second build at $3 with files listed in $4"
+
+let accepted=0
+let different=0
+let expected=0
+let matched=0
+let only=0
+let predictable=0
+let unreadable=0
+let total=0
+
+# break down the accepted
+let ACCAR=0
+let ACCELF=0
+let ACCGZ=0
+let ACCUNK=0
+
+# break down definitely different
+let DIFAR=0
+let DIFELF=0
+let DIFGZ=0
+let DIFSYM=0
+let DIFUNK=0
+
+# break down the expected differences
+let EXPAR=0
+let EXPELF=0
+let EXPGZ=0
+let EXPUNK=0
+
+# break down the identical files
+let MATAR=0
+let MATELF=0
+let MATGZ=0
+let MATSYM=0
+let MATUNK=0
+
+# break down how many of each type
+let TOTAR=0
+let TOTELF=0
+let TOTGZ=0
+let TOTSYM=0
+let TOTUNK=0
+
+# now identify differences between the two trees
+DIFF=`mktemp` || fatal "cannot create a temporary file"
+diff $2 $4 >$DIFF
+
+for RUN in '<' '>' ; do
+	grep -q "$RUN" $DIFF && onlyone "$RUN"
+done
+
+rm $DIFF
+
+# and compare them
+message "Results of file comparison:"
+message ""
+
+# Strip any trailing slash from the path for tidyness,
+# because the filenames all start with a slash.  Unfortunately,
+# unfortunately, '/' becomes empty, which breaks subroutines,
+# so special case it.
+# also, to process ar archives we need to extract them in temp
+# directories - that means that after cd'ing we've broken any
+# relative path, so save original pwd as necessary.
+P1=`echo $1 | sed 's%/$%%'`
+echo $1 | grep '^/' >/dev/null
+if [ $? -ne 0 ]; then
+	# relative path
+	OP1=${PWD}/
+	#echo "setting OP1 to $OP1"
+else
+	OP1=
+	#echo "$1 is an absolute path"
+fi
+test -z "$P1" && P1='/'
+P2=`echo $3 | sed 's%/$%%'`
+echo $3 | grep '^/' >/dev/null
+if [ $? -ne 0 ]; then
+	# relative path
+	OP2=${PWD}/
+	#echo "setting OP2 to $OP2"
+else
+	OP2=
+	#echo "$3 is an absolute path"
+fi
+test -z "$P2" && P2='/'
+
+echo "about to read $2"
+while read FILE ; do
+#echo "process $FILE"
+#echo "test existence of ${P2}${FILE}"
+	# confirm it exists in second build
+	# we have already reported files only in one build
+	if [ -f ${P2}"${FILE}" ]; then
+		let total=$total+1
+		# check we can read both of them
+		# or count as unreadable - I used to separate only-one-unreadable,
+		# but if you compre '/' and a _copy_  of /mnt/lfs that assumption
+		# breaks, so be less picky.
+		if ! [ -r "${P1}${FILE}" ] || ! [ -r "${P2}${FILE}" ]; then
+			message "cannot read one or both versions of $FILE"
+			let unreadable=$unreadable+1
+			continue
+		fi
+		if [ -h "${P1}${FILE}" ]; then
+			# for symlink, look at what it points to
+			# exceptionally, do not call filetype
+			TYPE=SYM
+			let TOTSYM=$TOTSYM+1
+			SL1=`ls -l "${P1}${FILE}" | awk '{ print $11 }'`
+			SL2=`ls -l "${P2}${FILE}" | awk '{ print $11 }'`
+			if [ "$SL1" = "$SL2" ]; then
+				echo "symlink $FILE matches for $SL1" >&5
+				let matched=$matched+1
+				let MATSYM=$MATSYM+1
+			else
+				failure TARGET " symlink $FILE points to $SL1 and $SL2"
+				echo $FILE >&9
+			fi
+		else
+			# regular file, start by typing it for accounting,
+			# then compare it
+			filetype ${P1}${FILE}
+			cmp -s "${P1}${FILE}" "${P2}${FILE}"
+			if [ $? -eq 0 ]; then
+				let matched=$matched+1
+				case $TYPE in
+					AR)
+						let MATAR=$MATAR+1
+						;;
+					ELF)
+						let MATELF=$MATELF+1
+						;;
+					GZ)
+						let MATGZ=$MATGZ+1
+						;;
+					UNK)
+						let MATUNK=$MATUNK+1
+						;;
+					*)
+						echo "unexpected TYPE of $TYPE for $FILE" >&2
+						exit 2
+						;;
+				esac
+				echo ${FILE} >&7
+			else
+				# seems different, can we do better ?
+				# test if we expect it to differ
+				expected $FILE
+				if [ $? -ne 0 ]; then
+					case $TYPE in
+					GZ)
+						testgzip $P1 $P2 $FILE ;;
+					AR)
+						testar $P1 $P2 $FILE ;;
+					ELF)
+						testso $P1 $P2 $FILE ;;
+					*)
+						# long-stop - strip dates from text files
+						tokenizeanddiff "${P1}${FILE}" "${P2}${FILE}" "$FILE"
+						if [ $? -eq 0 ]; then
+							message "accepted $FILE after processing"
+							let accepted=$accepted+1
+							let ACCUNK=$ACCUNK+1
+						else
+							failure "$FILE" " $FILE is different"
+						fi
+						;;
+					esac
+				fi
+			fi
+		fi
+	fi
+done < $2
+
+message ""
+# write totals to stderr as well as the results file
+emessage "$only files in only one of the builds"
+emessage "$total files compared, of which"
+emessage "$unreadable files could not be read, skipped"
+emessage "$matched files are identical"
+emessage "$expected files differed as expected"
+emessage "$accepted files had allowable differences"
+#emessage "$predictable files differed as they normally do"
+emessage "$different files differed"
+
+# totals of different file types
+emessage ""
+emessage "$TOTAR ar archives"
+emessage " of which $MATAR are identical"
+emessage " of which $ACCAR are accepted after strip-debug or extracting, diffing, tokenizing"
+emessage " of which $EXPAR differed as expected"
+emessage "    of which $DIFAR differed"
+emessage "$TOTELF ELF executables or shared libraries"
+emessage " of which $MATELF are identical"
+emessage " of which $ACCELF are accepted after stripping and tokenizing"
+emessage " of which $EXPELF differed as expected"
+emessage "    of which $DIFELF differed"
+emessage "$TOTGZ gzipped files"
+emessage " of which $MATGZ are identical"
+emessage " of which $ACCGZ are accepted after comparing beyond timestamp"
+emessage " of which $DIFGZ are different"
+emessage "$TOTSYM symbolic links"
+emessage " of which $MATSYM are identical"
+emessage "    of which $DIFSYM have different targets"
+emessage "$TOTUNK other files"
+emessage " of which $MATUNK are identical"
+emessage " of which $ACCUNK are accepted after tokenizing"
+emessage " of which $EXPUNK differed as expected"
+emessage "    of which $DIFUNK differed"
+
--- a/extras/filelist
+++ b/extras/filelist
@ -0,0 +1,161 @@
+#!/bin/bash
+#
+# filelist - prepare a list of the files in a built system.
+# This is for use in comparing two builds, to answer the
+# question "can it build itself" ?  Obviously, it can also be
+# used to compare two partial builds to see what changed.
+#
+# Call this script with the path to the root of the system.
+# If you are running the new system, the path is probably '/'.
+# Alternatively, the system might be mounted at /mnt/lfs, or
+# it might have been copied to ~/build1 or wherever.
+#
+# The output is a single file, filelist-CCYYMMDDhhmm
+# e.g. filelist-200510052108
+# which contains a list of the files to compare.
+# It excludes certain files which are not of interest (/tools,
+# /cross-tools, /home) together with any mounted filesystems.
+#
+# you should run this as a regular user - this will cause files
+# in e.g. /root to be ignored.
+#
+# I like to build a graphical desktop before using a new system
+# to see if it can build itself.  Filelist supports this by allowing
+# you to list the files at any time.  My normal process is:
+#
+# 1. Build an LFS system as normal.
+# 2. Build some extras before booting (nfs client, ntp, openssh,
+#    lynx, dhcp client, etc).
+# 3. Boot the new (first) system, set up users, run filelist.
+# 4. Build X and whatever else I want to use.
+# 5. Build the second system.
+# 6. Build the same extras for the second system.
+# 7. Boot the second system, to prove it works.
+# 8. Reboot to the first system, mount the second system at /mnt/lfs,
+#    then run filelist on /mnt/lfs and run the comparison.
+# 
+# Copyright (C) 2005, 2006 Ken Moffat <ken@linuxfromscratch.org>
+#
+# All rights reserved.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+# NON INFRINGEMENT.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+VERSION="002"
+
+function error() {
+	echo "usage:"
+	echo "  `basename $0` [--whole-build] path name for output file]"
+	echo " e.g. '/' or '/mnt/lfs' or 'build1'"
+	echo " use --whole-build to compare files from /tools or /cross-tools"
+	echo
+	echo "Error: $1"
+	exit 1
+}
+
+
+# process the argument, if there is one
+# normally, we exclude files in /tools and /cross-tools
+EXCLUDE=true
+
+case "$1" in
+
+--whole-build)
+	# do NOT exclude cross-tools and tools
+	EXCLUDE=
+	shift
+	;;
+--*)
+	error "Bad option $1"
+	;;
+
+esac
+
+
+if [ $# -lt 1 ]; then
+	error "no argument"
+elif [ $1 = "--version" ]; then
+	echo "$0 version $VERSION"
+	exit
+elif [ $# -gt 2 ]; then
+	error "more than two arguments"
+elif ! [ -d $1 ]; then
+	error "not a directory"
+fi
+if [ $# -eq 2 ]; then
+	OUTFILE=$2
+	if [ -e $2 ]; then
+		echo "output $2 already exists"
+		exit
+	fi
+else
+	NOW=`date +%Y%m%d%H%M`
+	OUTFILE=~/filelist-${NOW}
+fi
+
+if [ "$1" == "/" ]; then
+	LOC=$1
+else
+	# ensure the path or mountpoint ends with a slash
+	# because of the seds after the 'find'
+	LOC=`echo $1 | sed 's%[^/]$%&/%'`
+fi
+
+echo "will create file list in $OUTFILE"
+if [ -f $OUTFILE ]; then
+	echo "refusing to overwrite $OUTFILE"
+	exit 1
+fi
+
+# check we can indeed do this
+>$OUTFILE
+if [ $? -ne 0 ]; then
+	echo "error, cannot write to $OUTFILE"
+	exit 1
+fi
+
+# Explanation of the find command: we exclude any filesystems mounted below
+# this path (typically, you are looking at '/' so this excludes /proc /sys
+# /dev).  We only care about files, not directories.
+# Exclusions - insert whatever we were given at the start of the pathes to
+# exclude - this might mean we have '//' in them, but that shouldn't matter.
+# /tools* /tools and also e.g. /tools.old if you rename an old version
+# /cross-tools* similar, for cross-lfs
+# /home/* - in case /home is not a separate filesystem
+# /sources/* - where the book thinks you should keep sources
+# /tmp/* - we really don't care about any junk left in here
+# /misc/* - where I keep buildscripts and stamps
+# we then sed it so that / or /mnt/lfs/ or whatever are all converted to '/'.
+
+# At one time, EXCLUDE contained the ! -path strings for cross-tools and tools
+# but htat didn't work, so now it is just a marker to control this logic.
+if [ -z "$EXCLUDE" ]; then
+	find $LOC -xdev -xtype f \
+	 ! -path "${LOC}home/*" \
+	 ! -path "${LOC}sources/*" ! -path "${LOC}tmp/*" \
+	 ! -path "${LOC}misc/*" | sed "s%^${LOC}%/%" | sort >$OUTFILE
+else
+	find $LOC -xdev -xtype f \
+	 ! -path "${LOC}cross-tools*/*" ! -path "${LOC}tools/*" \
+	 ! -path "${LOC}home/*" \
+	 ! -path "${LOC}sources/*" ! -path "${LOC}tmp/*" \
+	 ! -path "${LOC}misc/*" | sed "s%^${LOC}%/%" | sort >$OUTFILE
+fi
+
+exit
+
+
+