#!/bin/csh -f
# lvrfy: A HTML Link Verification utility
# version 1.6d
# 6 December 1995
#
# By Preston Crow
# http://www.cs.dartmouth.edu/~crow/
# http://www.cs.dartmouth.edu/~crow/lvrfy.html
# Copyright (c) 1995
# Restriction on use:
# Significant modifications must be made available, free of charge
# or restriction, to Preston Crow.
# May not be redistributed in a modified form without notifying
# Preston Crow.
#
# Syntax:
# lvrfy startURL fromURL OKfile BADfile OFFSITEfile
# I use:
# (date;lvrfy / X lvrfy.OK lvrfy.BAD lvrfy.OFF;date) |& tee lvrfy.ERR |& Mail -s 'lvrfy completed on '`hostname` `whoami` &
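# (start at the site root with a dummy referring page, collect good, bad,
# and off-site links in lvrfy.OK, lvrfy.BAD, and lvrfy.OFF, and mail the
# timing and error output when the run completes)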
#
# Customizations:
set SERVER="coos.dartmouth.edu"
set SLASH="/usr/local/etc/httpd/htdocs"
set PUBLIC="public_html"
# List all valid index filenames, in order of preference
set INDEX=(index.html)
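# e.g. set INDEX=(index.html index.htm Welcome.html) -- example values;
# adjust to match the index filenames your server is configured to use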
# Temporary directory to use
set TMP="/tmp"
# Maximum nesting level, to avoid filling up the process table
set MAXNEST=6
# Aliases must be hard-coded in this version
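# For example (hypothetical mapping), a server alias from /pics/ to
# /data/pictures could be handled by adding another substitution to the
# sed command in the "Deal with aliases" section below, e.g.
#   sX^'$SLASH'/pics/X/data/pictures/X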
#
# Known bugs:
# * Doesn't handle tags inside comments correctly. It may fault on a
# commented-out link such as <!-- <A HREF=...> -->, or otherwise get
# confused.
# * Doesn't handle unclosed tags.
# * May seg fault on non-text or other pathological input cases.
# * May leave files in TMP when it doesn't complete successfully.
# * Doesn't recognize aliased directories, so links to aliased
# files will be reported as bad.
# --you can manually add aliasing below, if you can
# follow the `sed` syntax.
# * Certain pathological file or directory names may confuse
# it, but these should be quite rare.
# --I think I fixed most of these now.
#
# Warning: This script isn't secure, and shouldn't be run as root.
# I'm not sure whether a carefully constructed pathological case could
# misdirect the script, causing unexpected or dangerous side effects.
#
if ( $#argv > 6 || $#argv < 5 ) then
echo $0 $argv
echo Usage: 'lvrfy startURL fromURL OKfile BADfile OFFSITEfile'
sleep 60
exit 1
endif
set NEST=0
if ( $#argv >= 6 ) set NEST=$6
@ NEXTNEST = $NEST + 1
if ( $NEST == $MAXNEST ) set NEXTNEST=1
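# NEST is the recursion depth of this invocation; NEXTNEST is passed as
# the sixth argument of the child invocations generated below. At
# MAXNEST the children are queued in a work file (see the end of the
# script) and restarted from the top level at depth 1.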
#
# Set variables
#
set INURL="$1"
set PAGE="$1"
set OKFILE=$3
set BADFILE=$4
set OFFFILE=$5
touch $OKFILE
#
# Avoid pathological URLs
#
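# Links containing shell or sed metacharacters would be unsafe to embed
# in the command lines generated below, so they are reported and skipped.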
set BAD=no
# set | grep ^argv | sed -e 'sX^argv[ ]*(XX;sX .*XX'
echo "$PAGE" | grep '[][{}()$&*?\!;"'"'"'`]' >/dev/null && set BAD=yes
if ($BAD == yes ) then
echo Link to unprocessable URL "$INURL" from "$2" >> $BADFILE
exit 0
endif
#
# Convert URL to filename
#
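# A /~user/... URL maps to the $PUBLIC directory under that user's home
# directory. For example (hypothetical user), with the defaults above
# /~crow/x.html becomes ~crow/public_html/x.html, which csh then expands
# to a full path; if the user doesn't exist the expansion fails and the
# link is reported as bad.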
if ( "tilde" == "`echo $PAGE | grep '^/~' >& /dev/null && echo tilde`" ) then
set PAGE="`echo $PAGE | sed 'sX^/~[^/]*X&/'$PUBLIC'X' | cut -c2- `"
set PAGE=`echo $PAGE |& grep ^/`
if ( "$PAGE" == "" ) then
echo Link to non-existent user: "$INURL" from "$2" >> $BADFILE
exit 0
endif
else
set PAGE=$SLASH''$PAGE
endif
#
# Deal with aliases
#
set PAGE=`echo $PAGE | sed -e 'sX^'$SLASH'/cgi-bin/imagemap/.*X/dev/nullX;sX^'$SLASH'/icons/X/../icons/X;sX^'$SLASH'/cgi-bin/X/../cgi-bin/X'`
#
# Deal with directory indices.
#
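# If the URL names a directory, try each filename in INDEX in turn; if
# none exists, the server would generate the listing itself, which is
# reported separately below via USEINDEX.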
if (-d $PAGE) then
# Compensate for URLs missing the trailing /
set INURL=`echo $INURL/|sed sX//X/Xg`
set USEINDEX
set ICOUNT=0
while ($ICOUNT < $#INDEX )
@ ICOUNT = $ICOUNT + 1
set PAGE2=$PAGE/$INDEX[$ICOUNT]
if ( -e "$PAGE2" ) break
end
set PAGE=`echo $PAGE2|sed sX//X/Xg`
endif
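# CDIR is the directory portion of the URL; relative links found in this
# page are resolved against it below.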
set CDIR=`echo $INURL|sed 'sX/[^/]*$X/Xg'`
#
# Compensate for symbolic links in the path to the file
#
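# :h and :t are csh modifiers giving the directory and filename parts of
# a path; cd'ing to the directory and taking pwd canonicalizes it, so the
# same file is always recorded under the same name in OKFILE.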
#set PAGEDIR=`echo $PAGE | sed 'sX/[^/]*$XXg'`
set PAGEDIR=$PAGE:h
set PAGE2=`cd $PAGEDIR>&/dev/null;pwd>&/dev/null&&pwd||echo $PAGEDIR`
#if ($PAGE2 != '') set PAGE=$PAGE2/`echo $PAGE|sed 'sX.*/XXg'`
if ($PAGE2 != '') set PAGE=$PAGE2/$PAGE:t
#
# What is the status of this file? (processed, non-existent?)
#
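# A page already recorded in OKFILE has been visited; skipping it here
# is what keeps the recursion from looping on circular links.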
grep ^"$PAGE" $OKFILE >&/dev/null && exit 0
if ( ! -e "$PAGE") then
if ($?USEINDEX) then
echo Link to server-generated index page "$INURL" from "$2" >> $BADFILE
exit 0
endif
echo Link to non-existent page "$INURL" from "$2" >> $BADFILE
exit 0
else
if ( -r "$PAGE" ) then
echo "$PAGE" "$2" >> $OKFILE
else
echo Link to unreadable page "$INURL" from "$2" >> $BADFILE
exit 0
endif
endif
#
# If the filename doesn't end in "html", skip it.
#
echo $PAGE | grep 'html$' >&/dev/null || exit 0
#
# OK, we have a new file to process here. Find the links and recurse
#
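# Roughly: the first sed normalizes whitespace, joins tags that span
# lines, puts each tag on its own line, and drops comments; the second
# sed reduces HREF= and SRC= attributes to bare URLs, strips #fragments
# and ?queries, removes the http://$SERVER prefix, writes links that
# still contain a scheme (mailto:, ftp:, other hosts) to a temporary
# off-site file, prefixes relative links with CDIR, collapses ./ and
# ../, and prints only the local links. The awk then turns each local
# link into a command line that re-invokes this script on it at depth
# NEXTNEST.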
sed -e '\
:ok\
sX  *X Xg\
sX\nX Xg\
sX  *X Xg\
sX[ ]*=[ ]*X=Xg\
sX [^<>HhSs ][^<> ]*X Xg\
/<[^>]*$/N\
/\n/b ok\
sX\nX Xg\
sX[^<>]*<\([^<>]*\)>[^<>]*X< \1 > >\\
Xg\
sX  *X Xg\
sX< \!.*XXg\
:end' $PAGE | sed -n -e '\
sX<.*[Hh][Rr][Ee][Ff]=XHREF=Xg\
sX<.*[Ss][Rr][Cc]=XHREF=Xg\
sX .*XXg\
sX"XXg\
sX#.*$XX\
sX?.*$XX\
sX[Hh][Tt][Tt][Pp]:Xhttp:Xg\
sXhttp://'$SERVER'XXg\
/http:[/][^/]/sXhttp:XXg\
/^HREF=./b next\
b end\
:next\
sX^HREF=XXg\
sX:X:Xgw '$TMP/lvrfy.$OFFFILE.2'\
/^[^/]/s+^+'$CDIR'+g\
:ok\
sX/[^/]*/\.\./X/Xg\
sX/\./X/Xg\
sX//X/Xg\
t ok\
/^[^:]*$/p\
:end' |sed -e sX\'X\'\"\'\"\'Xg';sX\!X\\\!Xg' | awk '{printf("%s '"'"'%s'"'"' '"'"'%s'"'"' %s %s %s %s\n","'$0'",$1,"'$INURL'","'$OKFILE'","'$BADFILE'","'$OFFFILE'","'$NEXTNEST'")}' >$TMP/lvrfy.$NEST
# The last sed escapes embedded apostrophes and exclamation points so they survive the quoting in the generated command lines.
awk '{print $1, "'$PAGE'"}' $TMP/lvrfy.$OFFFILE.2 >>$OFFFILE
rm $TMP/lvrfy.$OFFFILE.2
#
# Now recurse, or save the work for later, if we're at the maximum depth.
#
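# Below MAXNEST the generated commands are run immediately (nested
# invocations exec the csh that reads them, so extra shells don't pile
# up); at MAXNEST they are appended to lvrfy.work instead, and the
# top-level (depth 0) invocation replays that file until it is empty.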
if ( $NEST == $MAXNEST ) then
cat $TMP/lvrfy.$NEST >> $TMP/lvrfy.work
rm $TMP/lvrfy.$NEST
else
echo exec rm $TMP/lvrfy.$NEST >> $TMP/lvrfy.$NEST
if ( $NEST > 0 ) then
exec csh -f <$TMP/lvrfy.$NEST
else
csh -f <$TMP/lvrfy.0
#
# Now do any work that we couldn't do at a deeper depth.
#
while ( -f $TMP/lvrfy.work )
mv $TMP/lvrfy.work $TMP/lvrfy.0
echo rm $TMP/lvrfy.0 >> $TMP/lvrfy.0
csh -f <$TMP/lvrfy.0
end
endif
endif