2 # $Id: robot.tcl,v 1.5 1999/12/27 11:49:31 adam Exp $
4 proc RobotFileNext {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
12 if {[file isfile $n]} {
13 if {[string first :.html $n] > 0} {
14 return http://[string range $area/ $off end]
16 return http://[string range $n $off end]
18 if {[file isdirectory $n]} {
19 set sb [RobotFileNext $n]
20 if {[string length $sb]} {
28 proc RobotFileExist {area host path} {
29 set comp [split $area/$host$path /]
32 if {![string length [lindex $comp $l]]} {
33 set comp [split $area/$host$path:.html /]
34 } elseif {[file exists [join $comp /]]} {
37 set comp [split $area/$host$path/:.html /]
39 return [file exists [join $comp /]]
42 proc RobotFileUnlink {area host path} {
43 set comp [split $area/$host$path /]
46 if {![string length [lindex $comp $l]]} {
47 set comp [split $area/$host$path:.html /]
49 if {[catch {exec rm [join $comp /]}]} return
51 for {set i $l} {$i > 0} {incr i -1} {
52 set path [join [lrange $comp 0 $i] /]
53 if {![catch {glob $path/*}]} return
58 proc RobotFileOpen {area host path} {
62 #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
63 if {[string compare $orgPwd $workdir]} {
64 puts "workdir = $workdir"
68 set comp [split $area/$host$path /]
69 set len [llength $comp]
71 for {set i 0} {$i < $len} {incr i} {
72 set d [lindex $comp $i]
73 if {[catch {cd ./$d}]} {
78 set d [lindex $comp $len]
79 if {[string length $d]} {
82 set out [open :.html w]
89 proc RobotRestart {} {
93 set url [RobotFileNext unvisited]
94 if {![string length $url]} {
95 puts "No more unvisited"
98 set r [RobotGetUrl $url {}]
100 puts "RobotGetUrl returned 0 on url=$url"
103 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
109 proc headSave {url out title} {
113 puts $out "<title>$title</title>"
114 if {[info exists URL($url,head,last-modified)]} {
115 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
118 if {[info exists URL($url,head,date)]} {
119 puts $out " <date>$URL($url,head,date)</date>"
121 if {[info exists URL($url,head,content-length)]} {
122 puts $out " <by>$URL($url,head,content-length)</by>"
124 if {[info exists URL($url,head,server)]} {
125 puts $out " <format>$URL($url,head,server)</format>"
128 puts $out {<publisher>}
129 puts $out " <identifier>$url</identifier>"
130 if {[info exists URL($url,head,content-type)]} {
131 puts $out " <type>$URL($url,head,content-type)</type>"
133 puts $out {</publisher>}
136 proc RobotHref {url hrefx hostx pathx} {
142 # puts "Ref url = $url href=$href"
143 # get method (if any)
144 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
148 if {[string compare $method http]} {
153 if {![regexp {^//([^/]+)(.*)} $hpath x host epath]} {
155 set host $URL($url,host)
157 if {![string length $epath]} {
161 foreach domain $domains {
162 if {[string match $domain $host]} {
171 if {[regexp {^(\#|\?)} $epath]} {
174 } elseif {![regexp {^([/][^\#?]*)} $epath x path]} {
176 set ext [file extension $URL($url,path)]
177 if {[string compare $ext {}]} {
178 set dpart [file dirname $URL($url,path)]
180 set dpart $URL($url,path)
182 regexp {^([^\#?]+)} $epath x path
183 set path [string trimright $dpart /]/$path
185 set c [split $path /]
188 set path [lindex $c $i]
191 switch -- [lindex $c $i] {
199 set path [lindex $c $i]/$path
204 set href "$method://$host$path"
205 # puts "Ref href = $href"
209 proc Robot401 {url} {
213 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
214 if {![RobotFileExist forbidden $URL($url,host) $URL($url,path)]} {
215 set outf [RobotFileOpen forbidden $URL($url,host) $URL($url,path)]
220 proc Robot404 {url} {
224 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
225 if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
226 set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
231 proc Robot301 {url tourl} {
234 puts "Redirecting from $url to $tourl"
235 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
236 if {[RobotHref $url tourl host path]} {
237 if {![RobotFileExist unvisited $host $path]} {
238 set outf [RobotFileOpen unvisited $host $path]
244 proc Robot200 {url} {
247 # puts "Parsing $url"
248 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
250 if {[info exists URL($url,buf)]} {
251 set htmlContent $URL($url,buf)
253 htmlSwitch $htmlContent \
256 headSave $url $out $body
260 regsub -all -nocase {<script.*</script>} $body {} abody
261 regsub -all {<[^\>]+>} $abody {} nbody
262 puts $out "<documentcontent>"
264 puts $out "</documentcontent>"
266 if {![info exists parm(href)]} {
271 headSave $url $out "untitled"
276 if {![RobotHref $url href host path]} continue
279 puts $out "<identifier>$href</identifier>"
280 puts $out "<description>$body</description>"
283 if {![RobotFileExist visited $host $path]} {
284 if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
285 puts "--- Error $msg"
294 headSave $url $out "untitled"
299 # puts "Parsing done"
300 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
303 proc RobotReadBody {url sock} {
306 set buffer [read $sock 16384]
307 set readCount [string length $buffer]
309 if {$readCount <= 0} {
314 # puts "Got $readCount bytes"
315 set URL($url,buf) $URL($url,buf)$buffer
319 proc RobotReadHead {url sock} {
322 set buffer [read $sock 8192]
323 set readCount [string length $buffer]
325 if {$readCount <= 0} {
330 # puts "Got $readCount bytes"
331 set URL($url,buf) $URL($url,buf)$buffer
333 set n [string first \n\n $URL($url,buf)]
337 set headbuf [string range $URL($url,buf) 0 $n]
340 set URL($url,buf) [string range $URL($url,buf) $n end]
342 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
343 set lines [split $headbuf \n]
344 foreach line $lines {
345 if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
346 set URL($url,head,[string tolower $name]) $value
350 set URL($url,state) skip
353 Robot301 $url $URL($url,head,location)
358 Robot301 $url $URL($url,head,location)
373 if {[info exists URL($url,head,content-type)]} {
374 if {![string compare $URL($url,head,content-type) text/html]} {
375 set URL($url,state) html
378 if {[string compare $URL($url,state) html]} {
383 fileevent $sock readable [list RobotReadBody $url $sock]
396 proc RobotConnect {url sock} {
399 fconfigure $sock -translation {auto crlf} -blocking 0
401 fileevent $sock readable [list RobotReadHead $url $sock]
402 puts $sock "GET $URL($url,path) HTTP/1.0"
403 puts $sock "Host: $URL($url,host)"
412 proc RobotGetUrl {url phost} {
416 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
419 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
423 set URL($url,method) $method
424 set URL($url,host) $host
425 set URL($url,port) $port
426 set URL($url,path) $path
427 set URL($url,state) head
429 if [catch {set sock [socket -async $host $port]}] {
432 RobotConnect $url $sock
437 if {![llength [info commands htmlSwitch]]} {
438 set e [info sharedlibextension]
439 if {[catch {load ./tclrobot$e}]} {
444 if {[llength $argv] < 2} {
445 puts "Tclrobot: usage <domain> <start>"
446 puts " Example: '*.dk' www.indexdata.dk"
452 set domains [lindex $argv 0]
453 set site [lindex $argv 1]
454 if {[string length $site]} {
455 set x [RobotFileOpen unvisited $site /]