#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.21 2001/10/26 13:26:11 adam Exp $
+# $Id: robot.tcl,v 1.34 2002/06/18 19:57:53 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
}
proc RobotFileNext {area} {
- global robotSeq global idletime ns
+ global robotSeq
+ global idletime ns
+ global status
# puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "Round robin"
+ set statusfile [open status w]
+ puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ close $statusfile
return wait
}
incr robotSeq
+# RobotFileExist --
+# Report whether the store entry for host/path exists below area.
+# area - top-level directory of the store to look in
+# host - host directory below area
+# path - URL path; it is split on / to derive the on-disk name
+# Returns the result of [file exists] on the derived name.
proc RobotFileExist {area host path} {
- # puts "RobotFileExist begin area=$area host=$host path=$path"
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "RobotFileExist begin area=$area host=$host path=$path"
+ }
set lpath [split $path /]
set l [llength $lpath]
incr l -1
+ # t is the last path component; it becomes the file name f<t>
set t [lindex $lpath $l]
incr l -1
+ # the leading components are joined with /d, so every directory level
+ # is stored as d<name>: /a/b/c maps to $area/$host/da/db/fc
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- # puts "RobotFileExist end npath=$npath"
+ if {$debuglevel > 3} {
+ puts "RobotFileExist end npath=$npath"
+ }
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
+ global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
set lpath [split $path /]
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
# puts "npath=$npath"
set comp [split $npath /]
+ if {[catch {exec rm [join $comp /]}]} return
+
set l [llength $comp]
incr l -1
- if {[catch {exec rm [join $comp /]}]} return
incr l -1
+ incr status($area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
+ global status
+ global debuglevel
if {![info exists workdir]} {
return stdout
}
- #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ if {$debuglevel > 3} {
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
exec mkdir $d
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open frobots.txt w]
- puts "creating robots.txt in $d"
- close $out
+ if {[string compare $path /robots.txt]} {
+ set out [open frobots.txt w]
+ puts "creating robots.txt in $d"
+ close $out
+ incr status(unvisited)
+ }
}
}
}
set d [lindex $comp $len]
if {[string length $d]} {
- if {[file isdirectory $d]} {
- set out [open $d/f $mode]
- } else {
- set out [open f$d $mode]
- }
+ set out [open f$d $mode]
} else {
set out [open f $mode]
}
+ if {$mode == "w"} {
+ incr status($area)
+ }
cd $orgPwd
return $out
}
if {[string length $href] > 256} {
return 0
}
- if {[string first {?} $href] >= 0} {
- return 0
- }
- if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
- return 0
- }
+
+# Skip pages that have ? in them
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
}
if {[string first / $surl]} {
# relative path
- regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set curpath $URL($url,path)
+ if {[info exists URL($url,bpath)]} {
+ set curpath $URL($url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {[expr $l >= 0]} {
set surl [string range $dpart 0 $l]$surl
foreach c $surllist {
switch -- $c {
.. {
- if {$pathl > 0} {
+ if {$pathl > 1} {
incr pathl -2
set path [lrange $path 0 $pathl]
incr pathl
}
}
}
- if {$pathl} {
- set path [join $path /]
- } else {
- set path ""
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
}
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
}
}
+# link --
+#   Emit a <cr> cross-reference record for a link found on page url and
+#   (re)queue the target in the unvisited area when this route to it is
+#   shorter than the one already recorded.
+#
+# Arguments:
+#   url       URL of the page the link was found on
+#   out       channel the <cr> record is written to
+#   href      link target as written in the page
+#   body      text written as the <description> of the record
+#   distance  link distance to assign to the target
+#
+# Results:
+#   None.  Returns without output when distance exceeds maxdistance or
+#   when RobotHref rejects the link.
+proc link {url out href body distance} {
+    global maxdistance
+    # the if condition is already an expression; no nested expr needed
+    if {$distance > $maxdistance} return
+
+    # RobotHref receives variable names; href, host and path are read below
+    if {![RobotHref $url href host path]} return
+
+    puts $out "<cr>"
+    puts $out "<identifier>$href</identifier>"
+    puts $out "<description>$body</description>"
+    puts $out "</cr>"
+
+    if {![RobotFileExist visited $host $path]} {
+        # 1000 stands for "no distance recorded yet"
+        set olddistance 1000
+        if {![RobotFileExist bad $host $path]} {
+            if {[RobotFileExist unvisited $host $path]} {
+                set inf [RobotFileOpen unvisited $host $path r]
+                RobotReadRecord $inf oldurl olddistance
+                RobotFileClose $inf
+            }
+        } else {
+            # a target in the bad area is never queued again: no distance
+            # is smaller than 0
+            set olddistance 0
+        }
+        if {[string length $olddistance] == 0} {
+            set olddistance 1000
+        }
+        if {$distance < $olddistance} {
+            set outf [RobotFileOpen unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    } elseif {[string compare $href $url]} {
+        # already visited and not a self-reference: re-queue only when the
+        # new route is shorter than the recorded one
+        set inf [RobotFileOpen visited $host $path r]
+        RobotReadRecord $inf xurl olddistance
+        # close through RobotFileClose like every other channel obtained
+        # from RobotFileOpen
+        RobotFileClose $inf
+        if {[string length $olddistance] == 0} {
+            set olddistance 1000
+        }
+        if {$distance < $olddistance} {
+            puts "OK remarking url=$url href=$href"
+            puts "olddistance = $olddistance"
+            puts "newdistance = $distance"
+            set outf [RobotFileOpen unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    }
+}
+
+# RobotTextHtml --
+# Walk the HTML held in URL($url,buf) with htmlSwitch and write the
+# page's meta elements, title, text content and outgoing links to out.
+# A <meta name="robots"> element is honoured: noindex suppresses the
+# title/content output, nofollow suppresses link extraction for <a>
+# and <area>.
+# url - key into the global URL array identifying the fetched page
+# out - channel receiving the generated records
proc RobotTextHtml {url out} {
global URL maxdistance
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
set distance 0
+ set fdistance 0
if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
- set distance [expr $URL($url,dist) + 1]
+ # fdistance is the page's own distance (used for frames);
+ # distance is one step further (used for ordinary links)
+ set fdistance $URL($url,dist)
+ set distance [expr $fdistance + 1]
}
htmlSwitch $URL($url,buf) \
title {
- puts $out "<title>$body</title>"
+ set title $body
} -nonest meta {
+ # collect metadata and save NAME= CONTENT=..
+ set metaname {}
+ set metacontent {}
puts -nonewline $out "<meta"
- foreach a [array names parm] {
- puts -nonewline $out " $a"
+ set al [array names parm]
+ foreach a $al {
+ # lname is only the lower-cased spelling; parm stays keyed by a
+ set lname [string tolower $a]
+ puts -nonewline $out " $lname"
puts -nonewline $out {="}
puts -nonewline $out $parm($a)
puts -nonewline $out {"}
+ switch -- $lname {
+ "name" {
+ set metaname [string tolower $parm($a)]
+ }
+ "content" {
+ set metacontent $parm($a)
+ }
+ }
+ # unset by the original key: the lower-cased name need not exist
+ unset parm($a)
+ }
+ puts $out "></meta>"
+ # go through robots directives (if any)
+ if {![string compare $metaname robots]} {
+ set direcs [split [string tolower $metacontent] ,]
+ if {[lsearch $direcs noindex] >= 0} {
+ set noindex 1
+ }
+ if {[lsearch $direcs nofollow] >= 0} {
+ set nofollow 1
+ }
}
- puts $out {></meta>}
} body {
- regsub -all -nocase {<script([^<]|(<!.*>))*</script>} $body {} abody
- regsub -all {<[^\>]+>} $abody {} nbody
- puts $out "<documentcontent>"
- puts $out $nbody
- puts $out "</documentcontent>"
- } -nonest a {
+ # don't print title or document content if noindex is used
+ if {!$noindex} {
+ puts $out "<title>$title</title>"
+ regsub -all {<!--[^-]*-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
+ puts $out "<documentcontent>"
+ puts $out $nbody
+ puts $out "</documentcontent>"
+ }
+ } -nonest base {
+ # <base href=.. >
if {![info exists parm(href)]} {
- puts "no href"
continue
}
- if {[expr $distance <= $maxdistance]} {
- set href [string trim $parm(href)]
- if {![RobotHref $url href host path]} continue
-
- puts $out "<cr>"
- puts $out "<identifier>$href</identifier>"
- puts $out "<description>$body</description>"
- puts $out "</cr>"
-
- if {![RobotFileExist visited $host $path]} {
- set olddistance 1000
- if {![RobotFileExist bad $host $path]} {
- if {[RobotFileExist unvisited $host $path]} {
- set inf [RobotFileOpen unvisited $host $path r]
- RobotReadRecord $inf oldurl olddistance
- RobotFileClose $inf
- }
- } else {
- set olddistance 0
- }
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- } elseif {[string compare $href $url]} {
- set inf [RobotFileOpen visited $host $path r]
- RobotReadRecord $inf xurl olddistance
- close $inf
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- puts "OK remarking url=$url href=$href"
- puts "olddistance = $olddistance"
- puts "newdistance = $distance"
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- }
- }
+ set href [string trim $parm(href)]
+ if {![RobotHref $url href host path]} continue
+ # remember the base path for this page under URL($url,bpath)
+ set URL($url,bpath) $path
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
+ if {![info exists parm(href)]} {
+ continue
+ }
+ link $url $out [string trim $parm(href)] $body $distance
} -nonest area {
+ if {$nofollow} continue
if {![info exists parm(href)]} {
- puts "no href"
continue
}
- if {[expr $distance <= $maxdistance]} {
- set href [string trim $parm(href)]
- if {![RobotHref $url href host path]} continue
-
- puts $out "<cr>"
- puts $out "<identifier>$href</identifier>"
- puts $out "<description></description>"
- puts $out "</cr>"
-
- if {![RobotFileExist visited $host $path]} {
- set olddistance 1000
- if {![RobotFileExist bad $host $path]} {
- if {[RobotFileExist unvisited $host $path]} {
- set inf [RobotFileOpen unvisited $host $path r]
- RobotReadRecord $inf oldurl olddistance
- RobotFileClose $inf
- }
- } else {
- set olddistance 0
- }
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- } elseif {[string compare $href $url]} {
- set inf [RobotFileOpen visited $host $path r]
- RobotReadRecord $inf xurl olddistance
- close $inf
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- puts "OK remarking url=$url href=$href"
- puts "olddistance = $olddistance"
- puts "newdistance = $distance"
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- }
- }
+ # this handler is nonest, so body is not set here: pass an empty
+ # description rather than whatever an earlier tag left behind
+ link $url $out [string trim $parm(href)] {} $distance
+ } -nonest frame {
+ if {![info exists parm(src)]} {
+ continue
+ }
+ # frames are linked at the page's own distance (fdistance); body is
+ # not set under nonest, so the description is empty
+ link $url $out [string trim $parm(src)] {} $fdistance
}
}
}
}
-proc Robot200 {url} {
+proc RobotWriteMetadata {url out} {
global URL domains
-
- set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
- puts -nonewline $out $URL($url,buf)
- RobotFileClose $out
- if {![checkrule mime $URL($url,head,content-type)]} {
- RobotError $url mimedeny
- return
- }
-
- set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
puts $out "<zmbot>"
set distance 1000
text/plain {
RobotTextPlain $url $out
}
- application/pdf {
- set pdff [open test.pdf w]
- puts -nonewline $pdff $URL($url,buf)
- close $pdff
- }
}
puts $out "</zmbot>"
+}
+
+# Robot200 --
+# Store a fetched page: the buffer URL($url,buf) is written to the raw
+# area, the output of RobotWriteMetadata to the visited area, and the
+# page's entry is removed from the unvisited area.
+# NOTE(review): the name suggests this handles HTTP 200 responses -
+# confirm against the caller.
+# url - key into the global URL array for the page
+proc Robot200 {url} {
+ global URL domains
+
+ # keep the unmodified response body in the raw area
+ set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
+ puts -nonewline $out $URL($url,buf)
RobotFileClose $out
- # puts "Parsing done"
+
+ # the visited entry holds the record written by RobotWriteMetadata
+ set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
+ RobotWriteMetadata $url $out
+ RobotFileClose $out
+
RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
}
if {[catch {set buffer [read $sock 2148]}]} {
RobotError $url 404
RobotRestart $url $sock
+ return
}
set readCount [string length $buffer]
if {![info exists URL($url,head,content-type)]} {
set URL($url,head,content-type) {}
}
- set binary 0
- switch $URL($url,head,content-type) {
- application/pdf {
- set binary 1
+ set binary 1
+ switch -glob -- $URL($url,head,content-type) {
+ text/* {
+ set binary 0
}
}
+ if {![regexp {/robots.txt$} $url]} {
+ if {![checkrule mime $URL($url,head,content-type)]} {
+ RobotError $url mimedeny
+ RobotRestart $url $sock
+ return
+ }
+ }
fileevent $sock readable [list RobotReadContent $url $sock $binary]
}
default {
set workdir [pwd]
set idletime 60000
set acceptLanguage {}
+set debuglevel 0
+set status(unvisited) 0
+set status(visited) 0
+set status(bad) 0
+set status(raw) 0
-set i 0
-set l [llength $argv]
-
-if {$l < 2} {
- puts {tclrobot: usage:}
- puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
- puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
- exit 1
-}
# Rules: allow, deny, url
-set debuglevel 0
proc checkrule {type this} {
global alrules
}
# consider type
if {[lindex $l 1] != $type} continue
- # consider mask
- if {![string match [lindex $l 2] $this]} continue
+ # consider mask (! negates)
+ set masks [lindex $l 2]
+ set ok 0
+ foreach mask $masks {
+ if {$debuglevel > 4} {
+ puts "consider single mask $mask"
+ }
+ if {[string index $mask 0] == "!"} {
+ set mask [string range $mask 1 end]
+ if {[string match $mask $this]} continue
+ } else {
+ if {![string match $mask $this]} continue
+ }
+ set ok 1
+ }
+ if {$debuglevel > 4} {
+ puts "ok = $ok"
+ }
+ if {!$ok} continue
# OK, we have a match
if {[lindex $l 0] == "allow"} {
if {$debuglevel > 3} {
- puts "CHECKRULE MATH OK"
+ puts "CHECKRULE MATCH OK"
}
return 1
} else {
}
}
if {$debuglevel > 3} {
- puts "CHECKRULE MATH OK"
+ puts "CHECKRULE MATCH OK"
}
return 1
}
# Parse options
+set i 0
+set l [llength $argv]
+
+if {$l < 2} {
+ puts {tclrobot: usage:}
+ puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+ exit 1
+}
while {$i < $l} {
set arg [lindex $argv $i]
switch -glob -- $arg {
puts "max distance=$maxdistance"
puts "max jobs=$robotsMax"
+
RobotStart
+
while {$robotsRunning} {
vwait robotsRunning
}
+
+set statusfile [open status w]
+puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+close $statusfile
+