2 # $Id: robot.tcl,v 1.4 1999/02/04 20:37:25 perhans Exp $
4 proc RobotFileNext {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
11 if {[file isfile $n]} {
12 if {[string first :.html $n] > 0} {
13 return http://[string range $area/ $off end]
15 return http://[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set sb [RobotFileNext $n]
19 if {[string length $sb]} {
27 proc RobotFileExist {area host path} {
28 set comp [split $area/$host$path /]
31 if {![string length [lindex $comp $l]]} {
32 set comp [split $area/$host$path:.html /]
34 return [file exists [join $comp /]]
37 proc RobotFileUnlink {area host path} {
38 set comp [split $area/$host$path /]
41 if {![string length [lindex $comp $l]]} {
42 set comp [split $area/$host$path:.html /]
44 if {[catch {exec rm [join $comp /]}]} return
46 for {set i $l} {$i > 0} {incr i -1} {
47 set path [join [lrange $comp 0 $i] /]
48 if {![catch {glob $path/*}]} return
53 proc RobotFileOpen {area host path} {
56 set comp [split $area/$host$path /]
57 set len [llength $comp]
59 for {set i 0} {$i < $len} {incr i} {
60 set d [lindex $comp $i]
61 if {[catch {cd ./$d}]} {
66 set d [lindex $comp $len]
67 if {[string length $d]} {
70 set out [open :.html w]
76 proc RobotRestart {} {
80 set url [RobotFileNext unvisited]
81 if {![string length $url]} break
82 set r [RobotGetUrl $url {}]
86 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
92 proc headSave {url out title} {
96 puts $out "<title>$title</title>"
97 if {[info exists URL($url,head,last-modified)]} {
98 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
101 if {[info exists URL($url,head,date)]} {
102 puts $out " <date>$URL($url,head,date)</date>"
104 if {[info exists URL($url,head,content-length)]} {
105 puts $out " <by>$URL($url,head,content-length)</by>"
107 if {[info exists URL($url,head,server)]} {
108 puts $out " <format>$URL($url,head,server)</format>"
111 puts $out {<publisher>}
112 puts $out " <identifier>$url</identifier>"
113 if {[info exists URL($url,head,content-type)]} {
114 puts $out " <type>$URL($url,head,content-type)</type>"
116 puts $out {</publisher>}
119 proc RobotSave {url} {
122 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
124 if {[info exists URL($url,line)]} {
125 set htmlContent [join $URL($url,line) \n]
127 htmlSwitch $htmlContent \
130 headSave $url $out $body
134 regsub -all -nocase {<script.*</script>} $body {} abody
135 regsub -all {<[^\>]+>} $abody {} nbody
136 puts $out "<documentcontent>"
138 puts $out "</documentcontent>"
140 if {![info exists parm(href)]} {
145 headSave $url $out "untitled"
149 if {[regexp {^\#} $parm(href)]} {
151 } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
153 if {![string compare $method http]} {
154 if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
155 set host $URL($url,host)
158 foreach domain $domains {
159 if {[string match $domain $host]} {
166 } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
167 set host $URL($url,host)
170 set ext [file extension $URL($url,path)]
171 if {[string compare $ext {}]} {
172 set dpart [file dirname $URL($url,path)]
174 set dpart $URL($url,path)
176 regexp {^([^#]+)} $parm(href) x path
177 set host $URL($url,host)
178 set path [string trimright $dpart /]/$path
181 set ext [file extension $path]
182 if {![string length $ext]} {
183 set path [string trimright $path /]/
185 set path [string trimright $path /]
187 set c [split $path /]
190 set path [lindex $c $i]
193 switch -- [lindex $c $i] {
201 set path [lindex $c $i]/$path
206 set href "$method://$host$path"
209 puts $out "<identifier>$href</identifier>"
210 puts $out "<description>$body</description>"
213 if {![regexp {/.*bin/} $href)]} {
214 if {![RobotFileExist visited $host $path]} {
215 set outf [RobotFileOpen unvisited $host $path]
222 headSave $url $out "untitled"
227 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
230 proc RobotRead {url sock} {
233 set readCount [gets $sock line]
234 if {$readCount < 0} {
240 } elseif {$readCount > 0} {
241 switch $URL($url,state) {
244 if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
245 set URL($url,head,[string tolower $name]) $value
249 lappend URL($url,line) $line
258 set URL($url,state) html
259 if {[info exists URL($url,head,content-type)]} {
260 if {![string compare $URL($url,head,content-type) text/html]} {
261 set URL($url,state) html
267 proc RobotConnect {url sock} {
270 fileevent $sock readable [list RobotRead $url $sock]
271 puts $sock "GET $URL($url,path) HTTP/1.0"
280 proc RobotGetUrl {url phost} {
285 if {[regexp {([^:]+)://([^/]+)([^ ]*)} $url x method host path]} {
286 puts "method=$method host=$host path=$path"
290 set URL($url,method) $method
291 set URL($url,host) $host
292 set URL($url,port) $port
293 set URL($url,path) $path
294 set URL($url,state) head
295 if [catch {set sock [socket -async $host $port]}] {
298 fconfigure $sock -translation {auto crlf}
299 RobotConnect $url $sock
304 if {![llength [info commands htmlSwitch]]} {
305 set e [info sharedlibextension]
306 if {[catch {load ./tclrobot$e}]} {
311 if {[llength $argv] < 2} {
312 puts "Tclrobot: usage <domain> <start>"
313 puts " Example: '*.dk' www.indexdata.dk"
316 set domains [lindex $argv 0]
317 set site [lindex $argv 1]
318 if {[string length $site]} {
319 set x [RobotFileOpen unvisited $site /]