#!/usr/bin/tclsh
# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $
#
set loghandle stdout
set robotsRunning 0
set workdir [pwd]
set idletime 15000
set acceptLanguage {}
set debuglevel 1
set libdir ""
proc logmsg {msg} {
global loghandle
puts $loghandle $msg
flush $loghandle
}
#proc dbgmsg {level msg} {
# global debuglevel
# if {[expr $debuglevel >= $level]} {
# logmsg $msg
# }
#}
proc dbgmsg {msg} {
global debuglevel
if {[expr $debuglevel >= 0]} {
logmsg $msg
}
}
# dbgmsg is always called with just one string!
proc fnameEncode {fname} {
regsub -all {&} $fname {%38} fname
regsub -all {<} $fname {%3C} fname
regsub -all {>} $fname {%3E} fname
regsub -all {\?} $fname {%3F} fname
regsub -all {\*} $fname {%2A} fname
return $fname
}
proc fnameDecode {fname} {
regsub -all {%38} $fname {&} fname
regsub -all {%3C} $fname {<} fname
regsub -all {%3E} $fname {>} fname
regsub -all {%3F} $fname {?} fname
regsub -all {%2A} $fname {*} fname
return $fname
}
proc RobotFileNext1 {area lead} {
# dbgmsg "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
foreach n $ns {
if {[file isfile $n]} {
set off [string last / $n]
# skip /
incr off
set end [string length $n]
# skip _.tkl
incr end -6
return $lead/[string range $n $off $end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
set off [string last / $n]
# skip /
incr off
set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
}
}
return {}
}
proc RobotWriteRecord {outf fromurl distance} {
puts $outf {}
puts $outf ""
puts $outf ""
puts $outf $distance
puts $outf ""
puts $outf ""
puts $outf $fromurl
puts $outf ""
puts $outf ""
}
proc RobotReadRecord {inf fromurlx distancex} {
upvar $fromurlx fromurl
upvar $distancex distance
gets $inf
gets $inf
gets $inf
set distance [string trim [gets $inf]]
# dbgmsg "got distance = $distance"
gets $inf
gets $inf
set fromurl [string trim [gets $inf]]
}
proc RobotFileNext {task area} {
global control
global idletime ns
global status
# dbgmsg "RobotFileNext seq=$control($task,seq)"
if {$control($task,seq) < 0} {
return {}
}
set target $control($task,target)
if {$control($task,seq) == 0} {
if {[catch {set ns($task) [glob $target/$area/*]}]} {
puts "----------- DONE-------- target=$target"
return done
}
}
# dbgmsg "ns=$ns($task)"
set off [string length $target/$area]
incr off
set n [lindex $ns($task) $control($task,seq)]
# dbgmsg "n=$n"
if {![string length $n]} {
set control($task,seq) -1
set statusfile [open $target/status w]
puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
close $statusfile
return wait
}
incr control($task,seq)
if {[file isfile $n/robots.txt_.tkl]} {
# dbgmsg "ok returning http://[string range $n $off end]/robots.txt"
return [fnameDecode http://[string range $n $off end]/robots.txt]
} elseif {[file isdirectory $n]} {
set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {[string length $sb]} {
return [fnameDecode $sb]
}
}
dbgmsg "no more work at end of RobotFileNext n=$n"
dbgmsg "ns=$ns($task)"
return {}
}
proc RobotFileExist {task area host path} {
global debuglevel control
if {$debuglevel > 3} {
dbgmsg "RobotFileExist begin area=$area host=$host path=$path"
}
set target $control($task,target)
return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
}
proc RobotFileUnlink {task area host path} {
global status control
set target $control($task,target)
# dbgmsg "RobotFileUnlink begin"
# dbgmsg "area=$area host=$host path=$path"
set npath [fnameEncode $target/$area/$host${path}_.tkl]
# dbgmsg "npath=$npath"
set comp [split $npath /]
if {[catch {exec rm $npath}]} return
set l [llength $comp]
incr l -2
incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
exec rmdir $path
}
# dbgmsg "RobotFileUnlink end"
}
proc RobotFileClose {out} {
if [string compare $out stdout] {
close $out
}
}
proc RobotFileOpen {task area host path {mode w}} {
set orgPwd [pwd]
global workdir status debuglevel control
# dbgmsg "RobotFileOpen task=$task path=$path"
set target $control($task,target)
set path [fnameEncode $path]
if {![info exists workdir]} {
return stdout
}
if {$debuglevel > 3} {
dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
}
if {[string compare $orgPwd $workdir]} {
dbgmsg "ooops. RobotFileOpen failed"
dbgmsg "workdir = $workdir"
dbgmsg "pwd = $orgPwd"
exit 1
}
set comp [split $target/$area/$host /]
set len [llength $comp]
incr len -1
# dbgmsg "1 comp=$comp"
for {set i 0} {$i <= $len} {incr i} {
set d [lindex $comp $i]
if {[string length $d] == 0} {
cd /
} elseif {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
if {[string compare $path /robots.txt]} {
set out [open robots.txt_.tkl w]
dbgmsg "creating robots.txt in $d"
close $out
incr status($task,unvisited)
}
}
}
}
set comp [split $path /]
set len [llength $comp]
incr len -1
# dbgmsg "2 path=$path comp=$comp"
for {set i 0} {$i < $len} {incr i} {
set d [lindex $comp $i]
if {[string length $d] > 0} {
if {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
}
}
}
set d [lindex $comp $len]
set out [open ${d}_.tkl $mode]
if {$mode == "w"} {
incr status($task,$area)
}
cd $orgPwd
return $out
}
proc RobotStartJob {root task} {
global control
set fname "$root$task"
set f [open $fname r]
set xml [read $f]
dbgmsg "Reading $fname"
close $f
# task type must be 2
if {![regexp {([^<]*)} $xml x tasktype]} {
return
}
set tasktype [string trim $tasktype]
if {![string match 2 $tasktype]} {
return
}
# status must not be finished or error
if {![regexp {([^<]*)} $xml x status]} {
return
}
if {$status == "finished"} {
dbgmsg "already finished"
return
}
if {$status == "error"} {
dbgmsg "already finished due to error"
return
}
# ignore if task has already been processed
dbgmsg "status = $status"
if {![CreateTask $task]} {
return
}
set control($task,taskfname) $fname
dbgmsg "Reading $fname stage 2"
htmlSwitch $xml \
url {
lappend starturls $body
} filter {
set type $parm(type)
set action $parm(action)
if {$type == "domain"} {
$action url http://$body/*
}
if {$type == "url"} {
$action url $body
}
if {$type == "mime"} {
$action mime $body
}
} target {
set ex [file rootname [file tail $task]]
#set control($task,target) "$root$body/$ex"
set control($task,target) "$control(tmpdir)/$ex"
set control($task,output) "$root$body"
} distance {
set control($task,distance) $body
} status {
set control($task,filestatus) $body
} tasktype {
set control($task,tasktype) $body
}
if {[info exists starturls]} {
foreach url $starturls {
puts "marking start urls $url"
url $url
}
}
if {$status == "pending"} {
regsub {[^<]*} $xml {running} xml2
set f [open $fname w]
puts -nonewline $f $xml2
close $f
}
}
proc RobotDoneJob {task} {
global daemon_dir control
if {![info exists daemon_dir]} {
return
}
set fname $control($task,taskfname)
set f [open $fname r]
set xml [read $f]
dbgmsg "Reading $fname"
regexp {([^<]*)} $xml x status
dbgmsg "------"
dbgmsg "status = $status"
close $f
regsub {[^<]*} $xml {finished} xml2
set f [open $fname w]
puts -nonewline $f $xml2
close $f
}
proc RobotScanDir {} {
global daemon_dir
if {![info exists daemon_dir]} {
return
}
foreach d $daemon_dir {
if {[catch {set files [glob $d/*.spl]}]} {
return
}
foreach fname $files {
if {[file isfile $fname] && [file readable $fname]} {
set jobfile [open $fname]
gets $jobfile portalroot
gets $jobfile portaltask
close $jobfile
RobotStartJob $portalroot $portaltask
}
}
}
}
proc RobotRR {task} {
global control robotsRunning tasks robotsMax status
dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
incr robotsRunning -1
# only one task gets through...
if {[string compare [lindex $tasks 0] $task]} {
return
}
dbgmsg "RobotRR. task = $task"
while {$robotsRunning} {
vwait robotsRunning
}
dbgmsg "Scan"
if {[catch {RobotScanDir} msg]} {
logmsg "RobotScanDir failed"
logmsg $msg
}
foreach t $tasks {
set target $control($t,target)
set statusfile [open $target/status w]
puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
close $statusfile
set control($t,seq) 0
RobotStart $t
}
}
proc RobotDaemonSig {} {
global daemon_cnt
incr daemon_cnt
}
proc RobotDaemonLoop {} {
global daemon_cnt tasks robotsRunning status
set daemon_cnt 0
while 1 {
logmsg $daemon_cnt
RobotScanDir
if {[info exists tasks]} {
logmsg "daemon loop tasks $tasks"
foreach t $tasks {
set control($t,seq) 0
RobotStart $t
}
while {$robotsRunning} {
vwait robotsRunning
}
}
after 30000 RobotDaemonSig
vwait daemon_cnt
}
}
proc RobotRestart {task url sock} {
global URL robotsRunning
close $sock
after cancel $URL($sock,cancel)
foreach v [array names URL $task,$url,*] {
unset URL($v)
}
incr robotsRunning -1
RobotStart $task
}
proc RobotStart {task} {
global URL
global robotsRunning robotsMax idletime status tasks
# dbgmsg "RobotStart $task running=$robotsRunning"
while {1} {
set url [RobotFileNext $task unvisited]
if {[string compare $url done] == 0} {
dbgmsg "In RobotStart task $task done"
catch {unset ntasks}
foreach t $tasks {
if {[string compare $t $task]} {
lappend ntasks $t
} else {
dbgmsg "task $t done"
}
}
if {![info exists ntasks]} {
unset tasks
dbgmsg "all done"
} else {
set tasks $ntasks
}
RobotDoneJob $task
return
}
if {![string length $url]} {
return
}
incr robotsRunning
if {[string compare $url wait] == 0} {
after $idletime [list RobotRR $task]
return
}
set r [RobotGetUrl $task $url {}]
if {!$r} {
if {$robotsRunning >= $robotsMax} return
} else {
incr robotsRunning -1
if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotFileClose $outf
}
RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
}
}
}
proc headSave {task url out} {
global URL
if {[info exists URL($task,$url,head,last-modified)]} {
puts $out "$URL($task,$url,head,last-modified)"
}
puts $out {}
if {[info exists URL($task,$url,head,date)]} {
puts $out " $URL($task,$url,head,date)"
}
if {[info exists URL($task,$url,head,content-length)]} {
puts $out " $URL($task,$url,head,content-length)"
}
if {[info exists URL($task,$url,head,server)]} {
puts $out " $URL($task,$url,head,server)"
}
puts $out {}
puts $out {}
puts $out " $url"
if {[info exists URL($task,$url,head,content-type)]} {
puts $out " $URL($task,$url,head,content-type)"
}
puts $out {}
}
proc RobotHref {task url hrefx hostx pathx} {
global URL control debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
if {$debuglevel > 1} {
dbgmsg "Ref input url = $url href=$href"
}
if {[string first { } $href] >= 0} {
return 0
}
if {[string length $href] > 256} {
return 0
}
# Skip pages that have ? in them
# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
# return 0
# }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
set method http
} else {
if {[string compare $method http]} {
return 0
}
}
# get host (if any)
if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
if {![string length $surl]} {
set surl /
}
if {[info exist control($task,domains)]} {
set ok 0
foreach domain $control($task,domains) {
if {[string match $domain $host]} {
set ok 1
break
}
}
if {!$ok} {
return 0
}
}
} else {
regexp {^([^\#]*)} $hpath x surl
set host $URL($task,$url,hostport)
}
if {![string length $surl]} {
return 0
}
if {[string first / $surl]} {
# relative path
set curpath $URL($task,$url,path)
if {[info exists URL($task,$url,bpath)]} {
set curpath $URL($task,$url,bpath)
}
regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
} else {
set surl $dpart/$surl
}
}
set surllist [split $surl /]
catch {unset path}
set pathl 0
foreach c $surllist {
switch -- $c {
.. {
if {$pathl > 1} {
incr pathl -2
set path [lrange $path 0 $pathl]
incr pathl
}
}
. {
}
default {
incr pathl
lappend path $c
}
}
}
if {$debuglevel > 4} {
dbgmsg "pathl=$pathl output path=$path"
}
set path [join $path /]
if {![string length $path]} {
set path /
}
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
if {$debuglevel > 1} {
dbgmsg "Ref result = $href"
}
return [checkrule $task url $href]
}
proc RobotError {task url code} {
global URL
dbgmsg "Bad URL $url (code $code)"
set fromurl {}
set distance -1
if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
}
proc RobotRedirect {task url tourl code} {
global URL
dbgmsg "Redirecting from $url to $tourl"
set distance {}
set fromurl {}
if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
if {[RobotHref $task $url tourl host path]} {
if {![RobotFileExist $task visited $host $path]} {
if {![RobotFileExist $task unvisited $host $path]} {
set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
} else {
set olddistance {}
set inf [RobotFileOpen $task visited $host $path r]
RobotReadRecord $inf oldurl olddistance
RobotFileClose $inf
if {[string length $olddistance] == 0} {
set olddistance 1000
}
if {[string length $distance] == 0} {
set distance 1000
}
dbgmsg "distance=$distance olddistance=$olddistance"
if {[expr $distance < $olddistance]} {
set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $tourl $distance
RobotFileClose $outf
}
}
}
if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
dbgmsg "unlink failed"
exit 1
}
}
proc wellform {body} {
regsub -all {} $body { } abody
regsub -all -nocase {