2 # $Id: robot.tcl,v 1.6 2000/12/07 20:16:11 adam Exp $
4 proc RobotFileNext {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
12 if {[file isfile $n]} {
13 if {[string first :.html $n] > 0} {
14 return http://[string range $area/ $off end]
16 return http://[string range $n $off end]
20 if {[file isdirectory $n]} {
21 set sb [RobotFileNext $n]
22 if {[string length $sb]} {
30 proc RobotFileExist {area host path} {
31 set comp [split $area/$host$path /]
34 if {![string length [lindex $comp $l]]} {
35 set comp [split $area/$host$path:.html /]
36 } elseif {[file exists [join $comp /]]} {
39 set comp [split $area/$host$path/:.html /]
41 return [file exists [join $comp /]]
44 proc RobotFileUnlink {area host path} {
45 set comp [split $area/$host$path /]
48 if {![string length [lindex $comp $l]]} {
49 set comp [split $area/$host$path:.html /]
51 if {[catch {exec rm [join $comp /]}]} return
53 for {set i $l} {$i > 0} {incr i -1} {
54 set path [join [lrange $comp 0 $i] /]
55 if {![catch {glob $path/*}]} return
60 proc RobotFileClose {out} {
61 if [string compare $out stdout] {
66 proc RobotFileOpen {area host path {mode w}} {
70 if {![info exists workdir]} {
73 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
74 if {[string compare $orgPwd $workdir]} {
75 puts "workdir = $workdir"
79 set comp [split $area/$host$path /]
80 set len [llength $comp]
82 for {set i 0} {$i < $len} {incr i} {
83 set d [lindex $comp $i]
84 if {[catch {cd ./$d}]} {
89 set d [lindex $comp $len]
90 if {[string length $d]} {
91 if {[file isdirectory $d]} {
92 set out [open $d/:.html $mode]
94 set out [open $d $mode]
97 set out [open :.html $mode]
100 #puts "RobotFileStop"
104 proc RobotRestart {} {
109 set url [RobotFileNext unvisited]
110 if {![string length $url]} {
113 set r [RobotGetUrl $url {}]
115 puts "RobotGetUrl returned 0 on url=$url"
118 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
124 proc headSave {url out} {
128 if {[info exists URL($url,head,last-modified)]} {
129 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
132 if {[info exists URL($url,head,date)]} {
133 puts $out " <date>$URL($url,head,date)</date>"
135 if {[info exists URL($url,head,content-length)]} {
136 puts $out " <by>$URL($url,head,content-length)</by>"
138 if {[info exists URL($url,head,server)]} {
139 puts $out " <format>$URL($url,head,server)</format>"
142 puts $out {<publisher>}
143 puts $out " <identifier>$url</identifier>"
144 if {[info exists URL($url,head,content-type)]} {
145 puts $out " <type>$URL($url,head,content-type)</type>"
147 puts $out {</publisher>}
150 proc RobotHref {url hrefx hostx pathx} {
156 puts "Ref url = $url href=$href"
157 # get method (if any)
158 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
162 if {[string compare $method http]} {
167 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
168 if {![string length $surl]} {
172 foreach domain $domains {
173 if {[string match $domain $host]} {
182 regexp {^([^\#]*)} $hpath x surl
183 set host $URL($url,host)
185 if {![string length $surl]} {
188 if {[string first / $surl]} {
190 regexp {^([^\#?]*)} $URL($url,path) x dpart
191 set l [string last / $dpart]
192 if {[expr $l >= 0]} {
193 set surl [string range $dpart 0 $l]$surl
195 set surl $dpart/$surl
198 set c [split $surl /]
201 set path [lindex $c $i]
204 switch -- [lindex $c $i] {
212 set path [lindex $c $i]/$path
217 set href "$method://$host$path"
218 puts "Ref href = $href"
222 proc Robot401 {url} {
228 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
229 set fromurl [gets $inf]
232 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
233 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
234 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
235 puts $outf "URL=$url 401"
236 puts $outf "Reference $fromurl"
241 proc Robot404 {url} {
247 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
248 set fromurl [gets $inf]
251 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
252 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
253 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
254 puts $outf "URL=$url 404"
255 puts $outf "Reference $fromurl"
260 proc Robot301 {url tourl} {
263 puts "Redirecting from $url to $tourl"
267 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
268 set fromurl [gets $inf]
271 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
272 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
273 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
274 puts $outf "URL=$url to $tourl 301"
275 puts $outf "Reference $fromurl"
278 if {[RobotHref $url tourl host path]} {
279 if {![RobotFileExist unvisited $host $path]} {
280 puts "Mark as unvisited"
281 set outf [RobotFileOpen unvisited $host $path]
288 proc RobotTextHtml {url out} {
292 htmlSwitch $URL($url,buf) \
298 puts $out "<title>$body</title>"
304 puts -nonewline $out "<meta"
305 foreach a [array names parm] {
306 puts -nonewline $out " $a"
307 puts -nonewline $out {="}
308 puts -nonewline $out $parm($a)
309 puts -nonewline $out {"}
313 regsub -all -nocase {<script.*</script>} $body {} abody
314 regsub -all {<[^\>]+>} $abody {} nbody
315 puts $out "<documentcontent>"
317 puts $out "</documentcontent>"
319 if {![info exists parm(href)]} {
329 if {![RobotHref $url href host path]} continue
332 puts $out "<identifier>$href</identifier>"
333 puts $out "<description>$body</description>"
336 if {![RobotFileExist visited $host $path]} {
337 if {![RobotFileExist bad $host $path]} {
338 if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
339 puts "--- Error $msg"
355 proc RobotTextPlain {url out} {
359 puts $out "<documentcontent>"
360 puts $out $URL($url,buf)
361 puts $out "</documentcontent>"
365 proc Robot200 {url} {
369 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
370 switch $URL($url,head,content-type) {
372 RobotTextHtml $url $out
375 RobotTextPlain $url $out
383 # puts "Parsing done"
384 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
387 proc RobotReadContent {url sock} {
390 set buffer [read $sock 16384]
391 set readCount [string length $buffer]
393 if {$readCount <= 0} {
398 # puts "Got $readCount bytes"
399 set URL($url,buf) $URL($url,buf)$buffer
403 proc RobotReadHeader {url sock} {
406 set buffer [read $sock 2148]
407 set readCount [string length $buffer]
409 if {$readCount <= 0} {
414 # puts "Got $readCount bytes"
415 set URL($url,buf) $URL($url,buf)$buffer
417 set n [string first \n\n $URL($url,buf)]
421 set headbuf [string range $URL($url,buf) 0 $n]
424 set URL($url,buf) [string range $URL($url,buf) $n end]
426 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
427 set lines [split $headbuf \n]
428 foreach line $lines {
429 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
430 set URL($url,head,[string tolower $name]) $value
434 set URL($url,state) skip
437 Robot301 $url $URL($url,head,location)
442 Robot301 $url $URL($url,head,location)
457 if {![info exists URL($url,head,content-type)]} {
458 set URL($url,head,content-type) {}
460 switch $URL($url,head,content-type) {
462 fileevent $sock readable [list RobotReadContent $url $sock]
465 fileevent $sock readable [list RobotReadContent $url $sock]
484 proc RobotConnect {url sock} {
487 fconfigure $sock -translation {auto crlf} -blocking 0
489 fileevent $sock readable [list RobotReadHeader $url $sock]
490 puts $sock "GET $URL($url,path) HTTP/1.0"
491 puts $sock "Host: $URL($url,host)"
492 puts $sock "User-Agent: $agent"
501 proc RobotGetUrl {url phost} {
505 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
508 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
512 set URL($url,method) $method
513 set URL($url,host) $host
514 set URL($url,port) $port
515 set URL($url,path) $path
516 set URL($url,state) head
518 if [catch {set sock [socket -async $host $port]}] {
521 RobotConnect $url $sock
526 if {![llength [info commands htmlSwitch]]} {
527 set e [info sharedlibextension]
528 if {[catch {load ./tclrobot$e}]} {
534 set agent "zmbot/0.0"
535 if {![catch {set os [exec uname -s -r]}]} {
536 set agent "$agent ($os)"
545 proc RobotRestart {} {
551 set url {http://www.indexdata.dk/zap/}
553 while {$robotMoreWork} {
565 if {[llength $argv] < 2} {
566 puts "Tclrobot: usage <domain> <start>"
567 puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
571 set domains [lindex $argv 0]
572 set site [lindex $argv 1]
573 if {[string length $site]} {
575 if [RobotGetUrl $site {}] {
577 puts "Couldn't process $site"
579 #set x [RobotFileOpen unvisited $site /robots.txt]
584 while {$robotMoreWork} {