2 # $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
4 proc RobotFileNext1 {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
12 if {[file isfile $n]} {
13 if {[string first :.html $n] > 0} {
14 return http://[string range $area/ $off end]
16 return http://[string range $n $off end]
20 if {[file isdirectory $n]} {
21 set sb [RobotFileNext1 $n]
22 if {[string length $sb]} {
30 proc RobotFileWait {} {
35 proc RobotFileNext {area} {
37 if {[catch {set ns [glob ${area}/*]}]} {
40 set off [string length $area]
43 set n [lindex $ns $robotSeq]
44 if {![string length $n]} {
45 puts "------------ N E X T R O U N D --------"
47 after 2000 RobotFileWait
50 set n [lindex $ns $robotSeq]
51 if {![string length $n]} {
56 if {[file isfile $n/robots.txt]} {
57 puts "ok returning http://[string range $n $off end]/robots.txt"
58 return http://[string range $n $off end]/robots.txt
59 } elseif {[file isdirectory $n]} {
60 set sb [RobotFileNext1 $n]
61 if {[string length $sb]} {
69 proc RobotFileExist {area host path} {
70 set comp [split $area/$host$path /]
73 if {![string length [lindex $comp $l]]} {
74 set comp [split $area/$host$path:.html /]
75 } elseif {[file exists [join $comp /]]} {
78 set comp [split $area/$host$path/:.html /]
80 return [file exists [join $comp /]]
83 proc RobotFileUnlink {area host path} {
84 set comp [split $area/$host$path /]
87 if {![string length [lindex $comp $l]]} {
88 set comp [split $area/$host$path:.html /]
90 if {[catch {exec rm [join $comp /]}]} return
92 for {set i $l} {$i > 0} {incr i -1} {
93 set path [join [lrange $comp 0 $i] /]
94 if {![catch {glob $path/*}]} return
99 proc RobotFileClose {out} {
100 if [string compare $out stdout] {
105 proc RobotFileOpen {area host path {mode w}} {
109 if {![info exists workdir]} {
112 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
113 if {[string compare $orgPwd $workdir]} {
114 puts "workdir = $workdir"
118 set comp [split $area/$host$path /]
119 set len [llength $comp]
121 for {set i 0} {$i < $len} {incr i} {
122 set d [lindex $comp $i]
123 if {[catch {cd ./$d}]} {
126 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
127 set out [open robots.txt w]
128 puts "creating robots.txt in $d"
133 set d [lindex $comp $len]
134 if {[string length $d]} {
135 if {[file isdirectory $d]} {
136 set out [open $d/:.html $mode]
138 set out [open $d $mode]
141 set out [open :.html $mode]
144 #puts "RobotFileStop"
148 proc RobotRestart {} {
153 set url [RobotFileNext unvisited]
154 if {![string length $url]} {
157 set r [RobotGetUrl $url {}]
159 puts "RobotGetUrl returned 0 on url=$url"
162 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
168 proc headSave {url out} {
172 if {[info exists URL($url,head,last-modified)]} {
173 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
176 if {[info exists URL($url,head,date)]} {
177 puts $out " <date>$URL($url,head,date)</date>"
179 if {[info exists URL($url,head,content-length)]} {
180 puts $out " <by>$URL($url,head,content-length)</by>"
182 if {[info exists URL($url,head,server)]} {
183 puts $out " <format>$URL($url,head,server)</format>"
186 puts $out {<publisher>}
187 puts $out " <identifier>$url</identifier>"
188 if {[info exists URL($url,head,content-type)]} {
189 puts $out " <type>$URL($url,head,content-type)</type>"
191 puts $out {</publisher>}
194 proc RobotHref {url hrefx hostx pathx} {
200 puts "Ref url = $url href=$href"
201 # get method (if any)
202 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
206 if {[string compare $method http]} {
211 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
212 if {![string length $surl]} {
216 foreach domain $domains {
217 if {[string match $domain $host]} {
226 regexp {^([^\#]*)} $hpath x surl
227 set host $URL($url,host)
229 if {![string length $surl]} {
232 if {[string first / $surl]} {
234 regexp {^([^\#?]*)} $URL($url,path) x dpart
235 set l [string last / $dpart]
236 if {[expr $l >= 0]} {
237 set surl [string range $dpart 0 $l]$surl
239 set surl $dpart/$surl
242 set c [split $surl /]
245 set path [lindex $c $i]
248 switch -- [lindex $c $i] {
256 set path [lindex $c $i]/$path
261 set href "$method://$host$path"
262 puts "Ref href = $href"
266 proc Robot401 {url} {
272 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
273 set fromurl [gets $inf]
276 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
277 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
278 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
279 puts $outf "URL=$url 401"
280 puts $outf "Reference $fromurl"
285 proc Robot404 {url} {
291 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
292 set fromurl [gets $inf]
295 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
296 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
297 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
298 puts $outf "URL=$url 404"
299 puts $outf "Reference $fromurl"
304 proc Robot301 {url tourl} {
307 puts "Redirecting from $url to $tourl"
311 set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
312 set fromurl [gets $inf]
315 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
316 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
317 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
318 puts $outf "URL=$url to $tourl 301"
319 puts $outf "Reference $fromurl"
322 if {[RobotHref $url tourl host path]} {
323 if {![RobotFileExist unvisited $host $path]} {
324 puts "Mark as unvisited"
325 set outf [RobotFileOpen unvisited $host $path]
332 proc RobotTextHtml {url out} {
336 htmlSwitch $URL($url,buf) \
342 puts $out "<title>$body</title>"
348 puts -nonewline $out "<meta"
349 foreach a [array names parm] {
350 puts -nonewline $out " $a"
351 puts -nonewline $out {="}
352 puts -nonewline $out $parm($a)
353 puts -nonewline $out {"}
357 regsub -all -nocase {<script.*</script>} $body {} abody
358 regsub -all {<[^\>]+>} $abody {} nbody
359 puts $out "<documentcontent>"
361 puts $out "</documentcontent>"
363 if {![info exists parm(href)]} {
373 if {![RobotHref $url href host path]} continue
376 puts $out "<identifier>$href</identifier>"
377 puts $out "<description>$body</description>"
380 if {![RobotFileExist visited $host $path]} {
381 if {![RobotFileExist bad $host $path]} {
382 if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
383 puts "--- Error $msg"
399 proc RobotTextPlain {url out} {
403 puts $out "<documentcontent>"
404 puts $out $URL($url,buf)
405 puts $out "</documentcontent>"
409 proc Robot200 {url} {
413 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
414 switch $URL($url,head,content-type) {
416 RobotTextHtml $url $out
419 RobotTextPlain $url $out
427 # puts "Parsing done"
428 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
431 proc RobotReadContent {url sock} {
434 set buffer [read $sock 16384]
435 set readCount [string length $buffer]
437 if {$readCount <= 0} {
442 # puts "Got $readCount bytes"
443 set URL($url,buf) $URL($url,buf)$buffer
447 proc RobotReadHeader {url sock} {
450 set buffer [read $sock 2148]
451 set readCount [string length $buffer]
453 if {$readCount <= 0} {
458 # puts "Got $readCount bytes"
459 set URL($url,buf) $URL($url,buf)$buffer
461 set n [string first \n\n $URL($url,buf)]
465 set headbuf [string range $URL($url,buf) 0 $n]
468 set URL($url,buf) [string range $URL($url,buf) $n end]
470 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
471 set lines [split $headbuf \n]
472 foreach line $lines {
473 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
474 set URL($url,head,[string tolower $name]) $value
478 set URL($url,state) skip
481 Robot301 $url $URL($url,head,location)
486 Robot301 $url $URL($url,head,location)
501 if {![info exists URL($url,head,content-type)]} {
502 set URL($url,head,content-type) {}
504 switch $URL($url,head,content-type) {
506 fileevent $sock readable [list RobotReadContent $url $sock]
509 fileevent $sock readable [list RobotReadContent $url $sock]
528 proc RobotConnect {url sock} {
531 fconfigure $sock -translation {auto crlf} -blocking 0
533 fileevent $sock readable [list RobotReadHeader $url $sock]
534 puts $sock "GET $URL($url,path) HTTP/1.0"
535 puts $sock "Host: $URL($url,host)"
536 puts $sock "User-Agent: $agent"
545 proc RobotGetUrl {url phost} {
549 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
552 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
556 set URL($url,method) $method
557 set URL($url,host) $host
558 set URL($url,port) $port
559 set URL($url,path) $path
560 set URL($url,state) head
562 if [catch {set sock [socket -async $host $port]}] {
565 RobotConnect $url $sock
570 if {![llength [info commands htmlSwitch]]} {
571 set e [info sharedlibextension]
572 if {[catch {load ./tclrobot$e}]} {
578 set agent "zmbot/0.0"
579 if {![catch {set os [exec uname -s -r]}]} {
580 set agent "$agent ($os)"
592 if {[llength $argv] < 2} {
593 puts "Tclrobot: usage <domain> <start>"
594 puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
598 set domains [lindex $argv 0]
599 set site [lindex $argv 1]
600 if {[string length $site]} {
602 if [RobotGetUrl $site {}] {
604 puts "Couldn't process $site"
608 while {$robotMoreWork} {