#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.38 2003/06/10 12:12:35 adam Exp $
+# $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
foreach n $ns {
if {[file isfile $n]} {
set off [string last / $n]
- incr off 2
- return $lead/[string range $n $off end]
+ # skip /
+ incr off
+ set end [string length $n]
+ # skip _.tkl
+ incr end -6
+ return $lead/[string range $n $off $end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
set off [string last / $n]
- incr off 2
+ # skip /
+ incr off
set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
proc RobotWriteRecord {outf fromurl distance} {
+ puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
puts $outf "<zmbot>"
puts $outf "<distance>"
puts $outf $distance
upvar $distancex distance
gets $inf
gets $inf
+ gets $inf
set distance [string trim [gets $inf]]
# puts "got distance = $distance"
gets $inf
}
proc RobotFileNext {task area} {
- global robotSeq
+ global control
global idletime ns
global status
- # puts "RobotFileNext robotSeq=$robotSeq($task)"
- if {$robotSeq($task) < 0} {
+ # puts "RobotFileNext seq=$control($task,seq)"
+ if {$control($task,seq) < 0} {
return {}
}
- if {$robotSeq($task) == 0} {
+ if {$control($task,seq) == 0} {
if {[catch {set ns($task) [glob $task/$area/*]}]} {
return done
}
# puts "ns=$ns($task)"
set off [string length $task/$area]
incr off
- set n [lindex $ns($task) $robotSeq($task)]
+ set n [lindex $ns($task) $control($task,seq)]
# puts "n=$n"
if {![string length $n]} {
- set robotSeq($task) -1
+ set control($task,seq) -1
flush stdout
set statusfile [open $task/status w]
puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
close $statusfile
return wait
}
- incr robotSeq($task)
- if {[file isfile $n/frobots.txt]} {
- puts "ok returning http://[string range $n $off end]/robots.txt"
+ incr control($task,seq)
+ if {[file isfile $n/robots.txt_.tkl]} {
+ # puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {$debuglevel > 3} {
puts "RobotFileExist begin area=$area host=$host path=$path"
}
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
- if {$debuglevel > 3} {
- puts "RobotFileExist end npath=$npath"
- }
- return [file exists $npath]
+ return [file exists $task/$area/$host${path}_.tkl]
}
proc RobotFileUnlink {task area host path} {
global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
- set lpath [split $path /]
- set l [llength $lpath]
- incr l -1
- set t [lindex $lpath $l]
- incr l -1
- set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ set npath $task/$area/$host${path}_.tkl
# puts "npath=$npath"
set comp [split $npath /]
- if {[catch {exec rm [join $comp /]}]} return
+ if {[catch {exec rm $npath}]} return
set l [llength $comp]
- incr l -1
- incr l -1
+ incr l -2
incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
- exec rmdir ./$path
+ exec rmdir $path
}
# puts "RobotFileUnlink end"
}
global status
global debuglevel
- puts "RobotFileOpen task=$task path=$path"
+ # puts "RobotFileOpen task=$task path=$path"
if {![info exists workdir]} {
return stdout
if {[string length $d] == 0} {
cd /
} elseif {[catch {cd $d}]} {
- puts "mkdir (1) d=$d"
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
if {[string compare $path /robots.txt]} {
- set out [open frobots.txt w]
+ set out [open robots.txt_.tkl w]
puts "creating robots.txt in $d"
close $out
incr status($task,unvisited)
# puts "2 path=$path comp=$comp"
for {set i 0} {$i < $len} {incr i} {
- set d "d[lindex $comp $i]"
- if {[string length $d] > 1} {
+ set d [lindex $comp $i]
+ if {[string length $d] > 0} {
if {[catch {cd $d}]} {
- puts "mkdir (2) d=$d"
exec mkdir $d
cd ./$d
}
}
}
set d [lindex $comp $len]
- if {[string length $d]} {
- set out [open f$d $mode]
- } else {
- set out [open f $mode]
- }
+ set out [open ${d}_.tkl $mode]
if {$mode == "w"} {
incr status($task,$area)
}
return $out
}
-
proc RobotStartJob {fname t} {
global control
-
set f [open $fname r]
set xml [read $f]
puts "Reading $fname"
}
proc RobotRR {task} {
- global robotSeq robotsRunning tasks robotsMax status
+ global control robotsRunning tasks robotsMax status
puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
incr robotsRunning -1
set statusfile [open $t/status w]
puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
close $statusfile
- set robotSeq($t) 0
+ set control($t,seq) 0
RobotStart $t
}
}
if {[info exists tasks]} {
puts "daemon loop tasks $tasks"
foreach t $tasks {
- set robotSeq($t) 0
+ set control($t,seq) 0
RobotStart $t
}
while {$robotsRunning} {
}
}
+proc wellform {body} {
+ regsub -all {<!--[^-]*-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} body
+ regsub -all {<[^\>]+>} $body {} abody
+ regsub -all { } $abody { } body
+ regsub -all {&} $body {&} abody
+ return $abody
+}
+
proc link {task url out href body distance} {
global URL control
if {[expr $distance > $control($task,distance)]} return
puts $out "<cr>"
puts $out "<identifier>$href</identifier>"
- puts $out "<description>$body</description>"
+ set abody [wellform $body]
+ puts $out "<description>$abody</description>"
puts $out "</cr>"
if {![RobotFileExist $task visited $host $path]} {
# don't print title of document content if noindex is used
if {!$noindex} {
puts $out "<title>$title</title>"
- regsub -all {<!--[^-]*-->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
- regsub -all {<[^\>]+>} $bbody {} nbody
+ set bbody [wellform $body]
puts $out "<documentcontent>"
- puts $out $nbody
+ puts $out $bbody
puts $out "</documentcontent>"
}
} -nonest base {
proc RobotWriteMetadata {task url out} {
global URL
+ set charset $URL($task,$url,charset)
+ puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
puts $out "<zmbot>"
set distance 1000
set version {}
set headbuf [string range $URL($task,$url,buf) 0 $n]
incr n 4
+ set URL($task,$url,charset) ISO-8859-1
set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
set URL($task,$url,head,[string tolower $name]) [string trim $value]
}
+ regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
}
puts "HTTP CODE $code"
set URL($task,$url,state) skip
global control
global debuglevel
+ set default_ret 1
+
if {$debuglevel > 3} {
puts "CHECKRULE $type $this"
}
# consider mask (! negates)
set masks [lindex $l 2]
set ok 0
+ set default_ret 0
foreach mask $masks {
if {$debuglevel > 4} {
puts "consider single mask $mask"
}
}
if {$debuglevel > 3} {
- puts "CHECKRULE MATCH OK"
+ puts "CHECKRULE MATCH DEFAULT $default_ret"
}
- return 1
+ return $default_ret
}
}
proc task {t} {
- global tasks task status robotSeq control
+ global tasks task status control
set task $t
set status($t,bad) 0
set status($t,raw) 0
set status($t,active) 1
- set robotSeq($t) 0
+ set control($t,seq) 0
set control($t,distance) 10
return 1
}