#!/bin/csh -f
# lvrfy: A HTML Link Verification utility
# version 1.6d
# 6 December 1995
#
# By Preston Crow
# http://www.cs.dartmouth.edu/~crow/
# http://www.cs.dartmouth.edu/~crow/lvrfy.html
# Copyright (c) 1995
# Restriction on use:
#   Significant modifications must be made available, free of charge
#   or restriction, to Preston Crow.
#   May not be redistributed in a modified form without notifying
#   Preston Crow.
#
# Purpose:
#   Maps a URL on the local server to a file, records whether that file
#   exists and is readable, and, for files whose name ends in "html",
#   extracts the HREF=/SRC= targets and re-invokes itself on each local one.
#
# Syntax:
#   lvrfy startURL fromURL OKfile BADfile OFFSITEfile
# Arguments:
#   $1  startURL     URL (server-relative path) to check
#   $2  fromURL      URL of the page that linked to $1 (used in reports only)
#   $3  OKfile       file receiving "filename fromURL" for each readable page
#   $4  BADfile      file receiving a message for each bad link
#   $5  OFFSITEfile  file receiving links that contain a colon
#   $6  (optional)   nesting level; supplied by the script's own recursive
#                    calls, treated as 0 when absent
# I use:
#   (date;lvrfy / X lvrfy.OK lvrfy.BAD lvrfy.OFF;date) |& tee lvrfy.ERR |& Mail -s 'lvrfy completed on '`hostname` `whoami` &
#
# Customizations:
set SERVER="coos.dartmouth.edu"
set SLASH="/usr/local/etc/httpd/htdocs"
set PUBLIC="public_html"
# List all valid index filenames, in order of preference
set INDEX=(index.html)
# Temporary directory to use
set TMP="/tmp"
# Maximum nesting level, to avoid filling up the process table
set MAXNEST=6
# Aliases must be hard-coded in this version
#
# Known bugs:
#  * Doesn't handle tags in comments correctly.  It may fault on a tag
#    placed inside a comment (the example originally given here appears to
#    be missing from this copy), or otherwise get confused.
#  * Doesn't handle unclosed tags.
#  * May seg fault on non-text or other pathological input cases.
#  * May leave files in TMP when it doesn't complete successfully.
#  * Doesn't recognize aliased directories, so links to aliased
#    files will be reported as bad.
#      --you can manually add aliasing below, if you can
#        follow the `sed` syntax.
#  * Certain pathological file or directory names may confuse
#    it, but these should be quite rare.
#      --I think I fixed most of these now.
#
# Warning: This script isn't secure, and shouldn't be run as root.
#   I'm not sure if it is possible for a carefully constructed pathological
#   case to misdirect the script, causing unexpected or dangerous side effects.
#
# NOTE(review): the work files under $TMP have fixed names (lvrfy.<level>,
#   lvrfy.work, lvrfy.<OFFSITEfile>.2), so two runs sharing the same TMP
#   would presumably interfere with each other -- confirm before running
#   concurrently.
#

# Argument check: refuse a 7th argument or a missing 5th one.
# NOTE(review): the reason for the 60 second sleep is not stated; presumably
# it slows down a runaway chain of bad self-invocations -- confirm.
if ( $7 != '' || $5 == '' ) then
    echo $0 $argv
    echo Usage: 'lvrfy startURL fromURL OKfile BADfile OFFSITEfile'
    sleep 60
    exit 1
endif

# Nesting level of this invocation (0 for the user's top-level call) and the
# level handed to the invocations generated below.  At the maximum level the
# generated commands are queued instead of run (see the end of the script),
# and they are queued with level 1.
set NEST=$6
if ($NEST == '') set NEST=0
@ NEXTNEST = $NEST + 1
if ( $NEST == $MAXNEST ) set NEXTNEST=1
#
# Set variables
#
# INURL keeps the URL for reporting; PAGE is turned into a filename below.
set INURL="$1"
set PAGE="$1"
set OKFILE=$3
set BADFILE=$4
set OFFFILE=$5
# Make sure OKFILE exists before it is grepped further down.
touch $OKFILE
#
# Avoid pathological URLs
#
# Any URL containing one of the characters in the grep bracket expression
# below is reported to BADFILE as unprocessable and not examined further.
set BAD=no
# set | grep ^argv | sed -e 'sX^argv[ ]*(XX;sX .*XX'
echo "$PAGE" | grep '[][{}()$&*?\!;"'"'"'`]' >/dev/null && set BAD=yes
if ($BAD == yes ) then
    echo Link to unprocessable URL "$INURL" from "$2" >> $BADFILE
    exit 0
endif
#
# Convert URL to filename
#
# URLs starting with /~ : insert /$PUBLIC after the first path component and
# drop the leading slash (cut -c2-), giving ~user/$PUBLIC/...
# The second assignment echoes that value unquoted and keeps only output that
# starts with "/"; `|&` sends error output through the same grep, so an error
# message leaves PAGE empty.
# NOTE(review): this presumably relies on csh expanding ~user in the unquoted
# $PAGE and complaining about unknown users -- confirm.
# All other URLs are simply prefixed with the document root $SLASH.
if ( "tilde" == "`echo $PAGE | grep '^/~' >& /dev/null && echo tilde`" ) then
    set PAGE="`echo $PAGE | sed 'sX^/~[^/]*X&/'$PUBLIC'X' | cut -c2- `"
    set PAGE=`echo $PAGE |& grep ^/`
    if ( "$PAGE" == "" ) then
        echo Link to non-existent user: "$INURL" from "$2" >> $BADFILE
        exit 0
    endif
else
    set PAGE=$SLASH''$PAGE
endif
#
# Deal with aliases
#
# Three hard-coded rewrites, applied to names that start with $SLASH:
#   $SLASH/cgi-bin/imagemap/...  ->  /dev/null
#   $SLASH/icons/                ->  /../icons/
#   $SLASH/cgi-bin/              ->  /../cgi-bin/
set PAGE=`echo $PAGE | sed -e 'sX^'$SLASH'/cgi-bin/imagemap/.*X/dev/nullX;sX^'$SLASH'/icons/X/../icons/X;sX^'$SLASH'/cgi-bin/X/../cgi-bin/X'`
#
# Deal with directory indices.
#
# For a directory, try each name in INDEX in order and stop at the first one
# that exists; if none exists PAGE ends up as the last candidate tried.
# USEINDEX is set (without a value) so that a missing index can be reported
# as a server-generated index page later on.
if (-d $PAGE) then
    # Compensate for URLs missing the trailing /
    set INURL=`echo $INURL/|sed sX//X/Xg`
    set USEINDEX
    set ICOUNT=0
    while ($ICOUNT < $#INDEX )
        @ ICOUNT = $ICOUNT + 1
        set PAGE2=$PAGE/$INDEX[$ICOUNT]
        if ( -e "$PAGE2" ) break
    end
    set PAGE=`echo $PAGE2|sed sX//X/Xg`
endif
# CDIR is INURL with everything after its last "/" removed; it is prefixed
# to relative links found in the page (see the second sed program below).
set CDIR=`echo $INURL|sed 'sX/[^/]*$X/Xg'`
#
# Compensate for symbolic links in the file
#
# The directory part of PAGE is replaced by what pwd prints after cd'ing
# into it; if pwd fails, the original directory name is echoed instead.
#set PAGEDIR=`echo $PAGE | sed 'sX/[^/]*$XXg'`
set PAGEDIR=$PAGE:h
set PAGE2=`cd $PAGEDIR>&/dev/null;pwd>&/dev/null&&pwd||echo $PAGEDIR`
#if ($PAGE2 != '') set PAGE=$PAGE2/`echo $PAGE|sed 'sX.*/XXg'`
if ($PAGE2 != '') set PAGE=$PAGE2/$PAGE:t
#
# What is the status of this file?  (processed, non-existent?)
#
# A file already listed in OKFILE is not processed again.
# NOTE(review): PAGE is used here as a regular expression anchored only at
# the start of the line, so an OKFILE entry that merely begins with PAGE
# (or matches through a "." in PAGE) also counts as processed -- confirm
# that this is acceptable.
grep ^"$PAGE" $OKFILE >&/dev/null && exit 0
if ( ! -e "$PAGE") then
    if ($?USEINDEX) then
        echo Link to server-generated index page "$INURL" from "$2" >> $BADFILE
        exit 0
    endif
    echo Link to non-existent page "$INURL" from "$2" >> $BADFILE
    exit 0
else
    if ( -r "$PAGE" ) then
        echo "$PAGE" "$2" >> $OKFILE
    else
        echo Link to unreadable page "$INURL" from "$2" >> $BADFILE
        exit 0
    endif
endif
#
# If filename doesn't end in "html," skip it.
#
echo $PAGE | grep 'html$' >&/dev/null || exit 0
#
# OK, we have a new file to process here.  Find the links and recurse
#
# The pipeline below has four stages and writes a list of csh commands to
# $TMP/lvrfy.$NEST:
#
#  1. First sed, reading $PAGE: squeezes whitespace, joins a line to the next
#     one while a "<" is still unclosed at the end of the line, and rewrites
#     the text around tags.
#     NOTE(review): several commands in this program look damaged in this
#     copy (whitespace and possibly markup lost): "sX *X Xg" as written can
#     match the empty string, and "sX[^<>]*[^<>]*X >..." has no tag in its
#     pattern.  They are reproduced unchanged here; compare against a
#     pristine copy of lvrfy 1.6d before relying on them.
#
#  2. Second sed (-n, prints only on request):
#       - replaces everything from "<" through HREF= or SRC= (any case)
#         with "HREF=";
#       - drops everything after the first space, all double quotes, any
#         "#..." fragment and any "?..." query;
#       - lower-cases the "http:" prefix, removes "http://$SERVER", and
#         removes "http:" from lines where it is followed by a single "/";
#       - lines not of the form "HREF=x" are skipped (branch to :end);
#       - after stripping "HREF=", lines containing ":" are written to
#         $TMP/lvrfy.$OFFFILE.2 by the w flag;
#       - links not starting with "/" get $CDIR prefixed;
#       - the :ok loop removes "/dir/../", "/./" and "//" until nothing
#         changes;
#       - finally only lines without any ":" are printed.
#
#  3. Third sed: escapes apostrophes and exclamation marks in the links.
#
#  4. awk: turns each link into one command line of the form
#         $0 'link' '$INURL' $OKFILE $BADFILE $OFFFILE $NEXTNEST
#     i.e. a call of this script for the link, with the current URL as
#     fromURL and the next nesting level as 6th argument.
#
sed -e '\
:ok\
sX *X Xg\
sX\nX Xg\
sX *X Xg\
sX[ ]*=[ ]*X=Xg\
sX [^<>HhSs ][^<> ]*X Xg\
/<[^>]*$/N\
/\n/b ok\
sX\nX Xg\
sX[^<>]*[^<>]*X >\\
Xg\
sX *X Xg\
sX< \!.*XXg\
:end' $PAGE | sed -n -e '\
sX<.*[Hh][Rr][Ee][Ff]=XHREF=Xg\
sX<.*[Ss][Rr][Cc]=XHREF=Xg\
sX .*XXg\
sX"XXg\
sX#.*$XX\
sX?.*$XX\
sX[Hh][Tt][Tt][Pp]:Xhttp:Xg\
sXhttp://'$SERVER'XXg\
/http:[/][^/]/sXhttp:XXg\
/^HREF=./b next\
b end\
:next\
sX^HREF=XXg\
sX:X:Xgw '$TMP/lvrfy.$OFFFILE.2'\
/^[^/]/s+^+'$CDIR'+g\
:ok\
sX/[^/]*/\.\./X/Xg\
sX/\./X/Xg\
sX//X/Xg\
t ok\
/^[^:]*$/p\
:end' |sed -e sX\'X\'\"\'\"\'Xg';sX\!X\\\!Xg' | awk '{printf("%s '"'"'%s'"'"' '"'"'%s'"'"' %s %s %s %s\n","'$0'",$1,"'$INURL'","'$OKFILE'","'$BADFILE'","'$OFFFILE'","'$NEXTNEST'")}' >$TMP/lvrfy.$NEST
# Last sed to escape embedded apostrophes and exclamation marks to avoid conflict.
# Append each collected colon-containing link to OFFFILE together with the
# file it was found in, then remove the scratch file written by sed's w flag.
awk '{print $1, "'$PAGE'"}' $TMP/lvrfy.$OFFFILE.2 >>$OFFFILE
rm $TMP/lvrfy.$OFFFILE.2
#
# Now recurse, or save the work for later, if we're at the maximum depth.
#
if ( $NEST == $MAXNEST ) then
    # Too deep: queue the generated commands in lvrfy.work; the level-0
    # invocation picks them up in the loop below.
    cat $TMP/lvrfy.$NEST >> $TMP/lvrfy.work
    rm $TMP/lvrfy.$NEST
else
    # The command file removes itself as its last command.
    echo exec rm $TMP/lvrfy.$NEST >> $TMP/lvrfy.$NEST
    if ( $NEST > 0 ) then
        # Nested invocation: replace this process with the csh that runs
        # the generated commands.
        exec csh -f <$TMP/lvrfy.$NEST
    else
        csh -f <$TMP/lvrfy.0
        #
        # Now do any work that we couldn't do at a deeper depth.
        #
        # Each pass may queue new work again, so repeat until no
        # lvrfy.work file is left.
        while ( -f $TMP/lvrfy.work )
            mv $TMP/lvrfy.work $TMP/lvrfy.0
            echo rm $TMP/lvrfy.0 >> $TMP/lvrfy.0
            csh -f <$TMP/lvrfy.0
        end
    endif
endif