#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.10 2001/01/23 09:20:32 adam Exp $
+# $Id: robot.tcl,v 1.26 2001/11/08 13:49:06 adam Exp $
#
proc RobotFileNext1 {area lead} {
- puts "RobotFileNext1 area=$area lead=$lead"
+ # puts "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
gets $inf
gets $inf
set distance [string trim [gets $inf]]
- puts "got distance = $distance"
+ # puts "got distance = $distance"
gets $inf
gets $inf
set fromurl [string trim [gets $inf]]
}
proc RobotFileNext {area} {
- global robotSeq global idleTime ns
+ global robotSeq global idletime ns
- puts "RobotFileNext robotSeq=$robotSeq"
+ # puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
return {}
}
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "------------ N E X T R O U N D --------"
+ puts "Round robin"
return wait
}
incr robotSeq
proc RobotFileExist {area host path} {
- puts "RobotFileExist begin area=$area host=$host path=$path"
+ # puts "RobotFileExist begin area=$area host=$host path=$path"
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- puts "RobotFileExist end npath=$npath"
+ # puts "RobotFileExist end npath=$npath"
return [file exists $npath]
}
proc RobotFileUnlink {area host path} {
- puts "RobotFileUnlink begin"
- puts "area=$area host=$host path=$path"
+ # puts "RobotFileUnlink begin"
+ # puts "area=$area host=$host path=$path"
set lpath [split $path /]
set l [llength $lpath]
incr l -1
set t [lindex $lpath $l]
incr l -1
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
- puts "npath=$npath"
+ # puts "npath=$npath"
set comp [split $npath /]
set l [llength $comp]
incr l -1
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
- puts "RobotFileUnlink end"
+ # puts "RobotFileUnlink end"
}
proc RobotFileClose {out} {
if {![info exists workdir]} {
return stdout
}
- puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
if {[string compare $orgPwd $workdir]} {
puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
proc RobotStart {} {
global URL
- global robotsRunning robotsMax idleTime
+ global robotsRunning robotsMax idletime
- puts "RobotStart"
+ # puts "RobotStart"
while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
}
incr robotsRunning
if {[string compare $url wait] == 0} {
- after $idleTime RobotRR
+ after $idletime RobotRR
return
}
set r [RobotGetUrl $url {}]
}
proc RobotHref {url hrefx hostx pathx} {
- global URL domains
+ global URL domains debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
- puts "Ref url = $url href=$href"
+ if {$debuglevel > 1} {
+ puts "Ref input url = $url href=$href"
+ }
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+ if {[string first {?} $href] >= 0} {
+ return 0
+ }
+ if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+ return 0
+ }
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
if {![string length $surl]} {
set surl /
}
- set ok 0
- foreach domain $domains {
- if {[string match $domain $host]} {
- set ok 1
- break
+ if {[info exist domains]} {
+ set ok 0
+ foreach domain $domains {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
}
- }
- if {!$ok} {
- return 0
- }
+ if {!$ok} {
+ return 0
+ }
+ }
} else {
regexp {^([^\#]*)} $hpath x surl
set host $URL($url,hostport)
set surl $dpart/$surl
}
}
- set c [split $surl /]
- set i [llength $c]
- incr i -1
- set path [lindex $c $i]
- incr i -1
- while {$i >= 0} {
- switch -- [lindex $c $i] {
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
.. {
- incr i -2
- if {$i < 0} {
- set i 0
+ if {$pathl > 1} {
+ incr pathl -2
+ set path [lrange $path 0 $pathl]
+ incr pathl
}
}
- . {
- incr i -1
- }
- default {
- set path [lindex $c $i]/$path
- incr i -1
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
}
}
}
+ if {$debuglevel > 4} {
+ puts "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
+ }
regsub -all {~} $path {%7E} path
set href "$method://$host$path"
- puts "Ref href = $href"
- return 1
+
+ if {$debuglevel > 1} {
+ puts "Ref result = $href"
+ }
+ return [checkrule url $href]
}
proc RobotError {url code} {
global URL
- puts "Bad URL $url, $code"
+ puts "Bad URL $url (code $code)"
set fromurl {}
set distance -1
if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
}
proc RobotTextHtml {url out} {
- global URL maxDistance
+ global URL maxdistance
set distance 0
- if {$maxDistance < 1000 && [info exists URL($url,dist)]} {
+ if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
set distance [expr $URL($url,dist) + 1]
}
htmlSwitch $URL($url,buf) \
}
puts $out {></meta>}
} body {
- regsub -all -nocase {<script.*</script>} $body {} abody
- regsub -all {<[^\>]+>} $abody {} nbody
+ regsub -all {<!--[^-]*->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
puts $out "<documentcontent>"
puts $out $nbody
puts $out "</documentcontent>"
- } a {
+ } -nonest a {
if {![info exists parm(href)]} {
puts "no href"
continue
}
- if {[expr $distance <= $maxDistance]} {
+ if {[expr $distance <= $maxdistance]} {
set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
}
}
}
+ } -nonest area {
+ if {![info exists parm(href)]} {
+ puts "no href"
+ continue
+ }
+ if {[expr $distance <= $maxdistance]} {
+ set href [string trim $parm(href)]
+ if {![RobotHref $url href host path]} continue
+
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ puts $out "<description></description>"
+ puts $out "</cr>"
+
+ if {![RobotFileExist visited $host $path]} {
+ set olddistance 1000
+ if {![RobotFileExist bad $host $path]} {
+ if {[RobotFileExist unvisited $host $path]} {
+ set inf [RobotFileOpen unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
+ set inf [RobotFileOpen visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+ close $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ puts "OK remarking url=$url href=$href"
+ puts "olddistance = $olddistance"
+ puts "newdistance = $distance"
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ }
+ }
}
}
global URL agent
set section 0
foreach l [split $buf \n] {
- if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+ if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
puts "cmd=$cmd arg=$arg"
- switch $cmd {
- User-Agent {
+ switch -- [string tolower $cmd] {
+ user-agent {
if {$section} break
set pat [string tolower $arg]*
set section [string match $pat $agent]
}
- Disallow {
+ disallow {
if {$section} {
puts "rule [list 0 $arg]"
lappend $v [list 0 $arg]
}
}
- Allow {
+ allow {
if {$section} {
puts "rule [list 1 $arg]"
lappend $v [list 1 $arg]
global URL
puts $out "<documentcontent>"
- puts $out $URL($url,buf)
+ regsub -all {<} $URL($url,buf) {\<} content
+ puts $out $content
puts $out "</documentcontent>"
if {![string compare $URL($url,path) /robots.txt]} {
}
}
-proc Robot200 {url} {
+proc RobotWriteMetadata {url out} {
global URL domains
-
- set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
+
puts $out "<zmbot>"
set distance 1000
headSave $url $out
puts "Parsing $url distance=$distance"
switch $URL($url,head,content-type) {
- text/html {
- if {[string length $distance]} {
- RobotTextHtml $url $out
- }
- }
- text/plain {
- RobotTextPlain $url $out
- }
- application/pdf {
- set pdff [open test.pdf w]
- puts -nonewline $pdff $URL($url,buf)
- close $pdff
- }
+ text/html {
+ if {[string length $distance]} {
+ RobotTextHtml $url $out
+ }
+ }
+ text/plain {
+ RobotTextPlain $url $out
+ }
+ application/pdf {
+ set pdff [open test.pdf w]
+ puts -nonewline $pdff $URL($url,buf)
+ close $pdff
+ }
}
puts $out "</zmbot>"
+}
+
+proc Robot200 {url} {
+ global URL domains
+
+ set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
+ puts -nonewline $out $URL($url,buf)
RobotFileClose $out
- # puts "Parsing done"
+
+ if {![checkrule mime $URL($url,head,content-type)]} {
+ RobotError $url mimedeny
+ return
+ }
+ set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
+ RobotWriteMetadata $url $out
+ RobotFileClose $out
+
RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
}
proc RobotReadContent {url sock binary} {
global URL
- puts "RobotReadContent $url"
set buffer [read $sock 16384]
set readCount [string length $buffer]
}
proc RobotReadHeader {url sock} {
- global URL
+ global URL debuglevel
- puts "RobotReadHeader $url"
+ if {$debuglevel > 1} {
+ puts "HTTP head $url"
+ }
if {[catch {set buffer [read $sock 2148]}]} {
RobotError $url 404
RobotRestart $url $sock
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
set lines [split $headbuf \n]
foreach line $lines {
- if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+ if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
set URL($url,head,[string tolower $name]) [string trim $value]
}
}
- puts "code = $code"
+ puts "HTTP CODE $code"
set URL($url,state) skip
switch $code {
301 {
}
proc RobotConnect {url sock} {
- global URL agent
+ global URL agent acceptLanguage
fconfigure $sock -translation {lf crlf} -blocking 0
fileevent $sock readable [list RobotReadHeader $url $sock]
puts $sock "GET $URL($url,path) HTTP/1.0"
puts $sock "Host: $URL($url,host)"
puts $sock "User-Agent: $agent"
+ if {[string length $acceptLanguage]} {
+ puts $sock "Accept-Language: $acceptLanguage"
+ }
puts $sock ""
flush $sock
set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
proc RobotGetUrl {url phost} {
global URL robotsRunning
flush stdout
- puts "RobotGetUrl --------- robotsRunning=$robotsRunning url=$url"
+ puts "Retrieve $robotsRunning url=$url"
if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
return -1
}
set buf [read $inf 32768]
close $inf
} else {
- set buf "User-Agent: *\nAllow: /\n"
+ set buf "User-agent: *\nAllow: /\n"
}
RobotsTxt0 URL($hostport,robots) $buf
}
}
}
if {!$ok} {
+ puts "skipped due to robots.txt"
return -1
}
}
}
}
-set agent "zmbot/0.0"
+set agent "zmbot/0.1"
if {![catch {set os [exec uname -s -r]}]} {
set agent "$agent ($os)"
- puts "agent: $agent"
}
+puts "agent: $agent"
+
proc bgerror {m} {
global errorInfo
puts "BGERROR $m"
}
set robotsRunning 0
-set robotsMax 5
set robotSeq 0
set workdir [pwd]
-set idleTime 60000
+set idletime 60000
+set acceptLanguage {}
+set debuglevel 0
-if {[llength $argv] < 2} {
- puts "Tclrobot: usage <range> <domain> <start>"
- puts " Example: 3 '*.indexdata.dk' http://www.indexdata.dk/"
- exit 1
+
+# Rules: allow, deny, url
+
+proc checkrule {type this} {
+ global alrules
+ global debuglevel
+
+ if {$debuglevel > 3} {
+ puts "CHECKRULE $type $this"
+ }
+ if {[info exist alrules]} {
+ foreach l $alrules {
+ if {$debuglevel > 3} {
+ puts "consider $l"
+ }
+ # consider type
+ if {[lindex $l 1] != $type} continue
+ # consider mask (! negates)
+ set masks [lindex $l 2]
+ set ok 0
+ foreach mask $masks {
+ if {$debuglevel > 4} {
+ puts "consider single mask $mask"
+ }
+ if {[string index $mask 0] == "!"} {
+ set mask [string range $mask 1 end]
+ if {[string match $mask $this]} continue
+ } else {
+ if {![string match $mask $this]} continue
+ }
+ set ok 1
+ }
+ if {$debuglevel > 4} {
+ puts "ok = $ok"
+ }
+ if {!$ok} continue
+ # OK, we have a match
+ if {[lindex $l 0] == "allow"} {
+ if {$debuglevel > 3} {
+ puts "CHECKRULE MATCH OK"
+ }
+ return 1
+ } else {
+ if {$debuglevel > 3} {
+ puts "CHECKFULE MATCH FAIL"
+ }
+ return 0
+ }
+ }
+ }
+ if {$debuglevel > 3} {
+ puts "CHECKRULE MATCH OK"
+ }
+ return 1
}
-set maxDistance [lindex $argv 0]
-set domains [lindex $argv 1]
-foreach href [lindex $argv 2] {
+
+proc url {href} {
+ global debuglevel
+
if {[RobotHref http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist visited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
- RobotWriteRecord $outf $href 0
- RobotFileClose $outf
+ if {![RobotFileExist visited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
+}
+
+proc deny {type stuff} {
+ global alrules
+
+ lappend alrules [list deny $type $stuff]
+}
+
+proc allow {type stuff} {
+ global alrules
+
+ lappend alrules [list allow $type $stuff]
+}
+
+proc debug {level} {
+ global debuglevel
+
+ set debuglevel $level
+}
+
+# Parse options
+
+set i 0
+set l [llength $argv]
+
+if {$l < 2} {
+ puts {tclrobot: usage:}
+ puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
+ puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+ exit 1
+}
+while {$i < $l} {
+ set arg [lindex $argv $i]
+ switch -glob -- $arg {
+ -j* {
+ set robotsMax [string range $arg 2 end]
+ if {![string length $robotsMax]} {
+ set robotsMax [lindex $argv [incr i]]
+ }
+ }
+ -c* {
+ set maxdistance [string range $arg 2 end]
+ if {![string length $maxdistance]} {
+ set maxdistance [lindex $argv [incr i]]
+ }
+ }
+ -d* {
+ set dom [string range $arg 2 end]
+ if {![string length $dom]} {
+ set dom [lindex $argv [incr i]]
+ }
+ lappend domains $dom
+ }
+ -i* {
+ set idletime [string range $arg 2 end]
+ if {![string length $idletime]} {
+ set idletime [lindex $argv [incr i]]
+ }
+ }
+ -l* {
+ set acceptLanguage [string range $arg 2 end]
+ if {![string length $acceptLanguage]} {
+ set acceptLanguage [lindex $argv [incr i]]
+ }
+ }
+ -r* {
+ set rfile [string range $arg 2 end]
+ if {![string length $rfile]} {
+ set rfile [lindex $argv [incr i]]
+ }
+ source $rfile
+ }
+ default {
+ set href $arg
+ if {[RobotHref http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist visited $host $path]} {
+ set outf [RobotFileOpen unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
}
}
+ incr i
}
+if {![info exist domains]} {
+ set domains {*}
+}
+if {![info exist maxdistance]} {
+ set maxdistance 50
+}
+if {![info exist robotsMax]} {
+ set robotsMax 5
+}
+
+puts "domains=$domains"
+puts "max distance=$maxdistance"
+puts "max jobs=$robotsMax"
+
RobotStart
+
while {$robotsRunning} {
vwait robotsRunning
}