2 # $Id: robot.tcl,v 1.30 2002/02/17 09:29:18 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
29 proc RobotWriteRecord {outf fromurl distance} {
31 puts $outf "<distance>"
33 puts $outf "</distance>"
34 puts $outf "<fromurl>"
36 puts $outf "</fromurl>"
40 proc RobotReadRecord {inf fromurlx distancex} {
41 upvar $fromurlx fromurl
42 upvar $distancex distance
45 set distance [string trim [gets $inf]]
46 # puts "got distance = $distance"
49 set fromurl [string trim [gets $inf]]
52 proc RobotFileNext {area} {
57 # puts "RobotFileNext robotSeq=$robotSeq"
62 if {[catch {set ns [glob ${area}/*]}]} {
66 set off [string length $area]
68 set n [lindex $ns $robotSeq]
69 if {![string length $n]} {
72 set statusfile [open status w]
73 puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
78 if {[file isfile $n/frobots.txt]} {
79 puts "ok returning http://[string range $n $off end]/robots.txt"
80 return http://[string range $n $off end]/robots.txt
81 } elseif {[file isdirectory $n]} {
82 set sb [RobotFileNext1 $n http://[string range $n $off end]]
83 if {[string length $sb]} {
87 puts "no more work at end of RobotFileNext n=$n"
93 proc RobotFileExist {area host path} {
96 if {$debuglevel > 3} {
97 puts "RobotFileExist begin area=$area host=$host path=$path"
99 set lpath [split $path /]
100 set l [llength $lpath]
102 set t [lindex $lpath $l]
104 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
105 if {$debuglevel > 3} {
106 puts "RobotFileExist end npath=$npath"
108 return [file exists $npath]
111 proc RobotFileUnlink {area host path} {
113 # puts "RobotFileUnlink begin"
114 # puts "area=$area host=$host path=$path"
115 set lpath [split $path /]
116 set l [llength $lpath]
118 set t [lindex $lpath $l]
120 set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
121 # puts "npath=$npath"
122 set comp [split $npath /]
123 if {[catch {exec rm [join $comp /]}]} return
125 set l [llength $comp]
128 incr status($area) -1
129 for {set i $l} {$i > 0} {incr i -1} {
130 set path [join [lrange $comp 0 $i] /]
131 if {![catch {glob $path/*}]} return
134 # puts "RobotFileUnlink end"
137 proc RobotFileClose {out} {
138 if [string compare $out stdout] {
143 proc RobotFileOpen {area host path {mode w}} {
149 if {![info exists workdir]} {
152 if {$debuglevel > 3} {
153 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
155 if {[string compare $orgPwd $workdir]} {
156 puts "ooops. RobotFileOpen failed"
157 puts "workdir = $workdir"
161 set comp [split $area/$host$path /]
162 set len [llength $comp]
164 for {set i 0} {$i < $len} {incr i} {
166 set d "d[lindex $comp $i]"
168 set d [lindex $comp $i]
170 if {[catch {cd ./$d}]} {
173 if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
174 set out [open frobots.txt w]
175 puts "creating robots.txt in $d"
177 incr status(unvisited)
181 set d [lindex $comp $len]
182 if {[string length $d]} {
183 set out [open f$d $mode]
185 if {[file isfile $d/f]} {
186 set out [open $d/f $mode]
188 set out [open f$d $mode]
192 set out [open f $mode]
202 global robotSeq robotsRunning
204 incr robotsRunning -1
205 while {$robotsRunning} {
212 proc RobotRestart {url sock} {
213 global URL robotsRunning
216 after cancel $URL($sock,cancel)
218 foreach v [array names URL $url,*] {
222 incr robotsRunning -1
228 global robotsRunning robotsMax idletime
232 set url [RobotFileNext unvisited]
233 if {![string length $url]} {
237 if {[string compare $url wait] == 0} {
238 after $idletime RobotRR
241 set r [RobotGetUrl $url {}]
243 if {$robotsRunning >= $robotsMax} return
245 incr robotsRunning -1
246 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
247 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
250 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
255 proc headSave {url out} {
258 if {[info exists URL($url,head,last-modified)]} {
259 puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
262 if {[info exists URL($url,head,date)]} {
263 puts $out " <date>$URL($url,head,date)</date>"
265 if {[info exists URL($url,head,content-length)]} {
266 puts $out " <by>$URL($url,head,content-length)</by>"
268 if {[info exists URL($url,head,server)]} {
269 puts $out " <format>$URL($url,head,server)</format>"
272 puts $out {<publisher>}
273 puts $out " <identifier>$url</identifier>"
274 if {[info exists URL($url,head,content-type)]} {
275 puts $out " <type>$URL($url,head,content-type)</type>"
277 puts $out {</publisher>}
280 proc RobotHref {url hrefx hostx pathx} {
281 global URL domains debuglevel
286 if {$debuglevel > 1} {
287 puts "Ref input url = $url href=$href"
290 if {[string first { } $href] >= 0} {
293 if {[string length $href] > 256} {
296 if {[string first {?} $href] >= 0} {
299 if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
302 # get method (if any)
303 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
307 if {[string compare $method http]} {
312 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
313 if {![string length $surl]} {
316 if {[info exist domains]} {
318 foreach domain $domains {
319 if {[string match $domain $host]} {
329 regexp {^([^\#]*)} $hpath x surl
330 set host $URL($url,hostport)
332 if {![string length $surl]} {
335 if {[string first / $surl]} {
337 set curpath $URL($url,path)
338 if {[info exists URL($url,bpath)]} {
339 set curpath $URL($url,bpath)
341 regexp {^([^\#?]*)} $curpath x dpart
342 set l [string last / $dpart]
343 if {[expr $l >= 0]} {
344 set surl [string range $dpart 0 $l]$surl
346 set surl $dpart/$surl
349 set surllist [split $surl /]
352 foreach c $surllist {
357 set path [lrange $path 0 $pathl]
370 if {$debuglevel > 4} {
371 puts "pathl=$pathl output path=$path"
373 set path [join $path /]
374 if {![string length $path]} {
377 regsub -all {~} $path {%7E} path
378 set href "$method://$host$path"
380 if {$debuglevel > 1} {
381 puts "Ref result = $href"
383 return [checkrule url $href]
386 proc RobotError {url code} {
389 puts "Bad URL $url (code $code)"
392 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
393 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
394 RobotReadRecord $inf fromurl distance
397 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
398 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
399 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
400 RobotWriteRecord $outf $fromurl $distance
405 proc RobotRedirect {url tourl code} {
408 puts "Redirecting from $url to $tourl"
412 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
413 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
414 RobotReadRecord $inf fromurl distance
417 if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
418 set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
419 RobotWriteRecord $outf $fromurl $distance
422 if {[RobotHref $url tourl host path]} {
423 if {![RobotFileExist visited $host $path]} {
424 if {![RobotFileExist unvisited $host $path]} {
425 set outf [RobotFileOpen unvisited $host $path]
426 RobotWriteRecord $outf $fromurl $distance
431 set inf [RobotFileOpen visited $host $path r]
432 RobotReadRecord $inf oldurl olddistance
434 if {[string length $olddistance] == 0} {
437 if {[string length $distance] == 0} {
440 puts "distance=$distance olddistance=$olddistance"
441 if {[expr $distance < $olddistance]} {
442 set outf [RobotFileOpen unvisited $host $path]
443 RobotWriteRecord $outf $tourl $distance
448 if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
454 proc link {url out href body distance} {
455 global URL maxdistance
456 if {[expr $distance > $maxdistance]} return
458 if {![RobotHref $url href host path]} return
461 puts $out "<identifier>$href</identifier>"
462 puts $out "<description>$body</description>"
465 if {![RobotFileExist visited $host $path]} {
467 if {![RobotFileExist bad $host $path]} {
468 if {[RobotFileExist unvisited $host $path]} {
469 set inf [RobotFileOpen unvisited $host $path r]
470 RobotReadRecord $inf oldurl olddistance
476 if {[string length $olddistance] == 0} {
479 if {[expr $distance < $olddistance]} {
480 set outf [RobotFileOpen unvisited $host $path]
481 RobotWriteRecord $outf $url $distance
484 } elseif {[string compare $href $url]} {
485 set inf [RobotFileOpen visited $host $path r]
486 RobotReadRecord $inf xurl olddistance
488 if {[string length $olddistance] == 0} {
491 if {[expr $distance < $olddistance]} {
492 puts "OK remarking url=$url href=$href"
493 puts "olddistance = $olddistance"
494 puts "newdistance = $distance"
495 set outf [RobotFileOpen unvisited $host $path]
496 RobotWriteRecord $outf $url $distance
502 proc RobotTextHtml {url out} {
503 global URL maxdistance
505 # set title so we can emit it for the body
507 # if true, nothing will be indexed
509 # if true, nothing will be followed
514 if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
515 set fdistance $URL($url,dist)
516 set distance [expr $fdistance + 1]
518 htmlSwitch $URL($url,buf) \
522 # collect metadata and save NAME= CONTENT=..
525 puts -nonewline $out "<meta"
526 foreach a [array names parm] {
527 set al [string tolower $a]
528 puts -nonewline $out " $al"
529 puts -nonewline $out {="}
530 puts -nonewline $out $parm($a)
531 puts -nonewline $out {"}
534 set metaname [string tolower $parm($a)]
537 set metacontent $parm($a)
542 # go through robots directives (af any)
543 if {![string compare $metaname robots]} {
544 set direcs [split [string tolower $metacontent] ,]
545 if {[lsearch $direcs noindex] >= 0} {
548 if {[lsearch $direcs nofollow] >= 0} {
553 # don't print title of document content if noindex is used
555 puts $out "<title>$title</title>"
556 regsub -all {<!--[^-]*->} $body { } abody
557 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
558 regsub -all {<[^\>]+>} $bbody {} nbody
559 puts $out "<documentcontent>"
561 puts $out "</documentcontent>"
565 if {![info exists parm(href)]} {
568 set href [string trim $parm(href)]
569 if {![RobotHref $url href host path]} continue
570 set URL($url,bpath) $path
572 # <a href="...."> .. </a>
573 # we're not using nonest - otherwise body isn't set
574 if {$nofollow} continue
575 if {![info exists parm(href)]} {
578 link $url $out [string trim $parm(href)] $body $distance
580 if {$nofollow} continue
581 if {![info exists parm(href)]} {
584 link $url $out [string trim $parm(href)] $body $distance
586 if {![info exists parm(src)]} {
589 link $url $out [string trim $parm(src)] $body $fdistance
593 proc RobotsTxt {url} {
596 RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf)
599 proc RobotsTxt0 {v buf} {
602 foreach l [split $buf \n] {
603 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
604 puts "cmd=$cmd arg=$arg"
605 switch -- [string tolower $cmd] {
608 set pat [string tolower $arg]*
609 set section [string match $pat $agent]
613 puts "rule [list 0 $arg]"
614 lappend $v [list 0 $arg]
619 puts "rule [list 1 $arg]"
620 lappend $v [list 1 $arg]
628 proc RobotTextPlain {url out} {
631 puts $out "<documentcontent>"
632 regsub -all {<} $URL($url,buf) {\<} content
634 puts $out "</documentcontent>"
636 if {![string compare $URL($url,path) /robots.txt]} {
641 proc RobotWriteMetadata {url out} {
647 if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
648 set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
649 RobotReadRecord $inf fromurl distance
652 set URL($url,dist) $distance
653 puts $out "<distance>"
654 puts $out " $distance"
655 puts $out "</distance>"
657 puts "Parsing $url distance=$distance"
658 switch $URL($url,head,content-type) {
660 if {[string length $distance]} {
661 RobotTextHtml $url $out
665 RobotTextPlain $url $out
671 proc Robot200 {url} {
674 set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
675 puts -nonewline $out $URL($url,buf)
678 set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
679 RobotWriteMetadata $url $out
682 RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
685 proc RobotReadContent {url sock binary} {
688 set buffer [read $sock 16384]
689 set readCount [string length $buffer]
691 if {$readCount <= 0} {
693 RobotRestart $url $sock
694 } elseif {!$binary && [string first \0 $buffer] >= 0} {
696 RobotRestart $url $sock
698 # puts "Got $readCount bytes"
699 set URL($url,buf) $URL($url,buf)$buffer
703 proc RobotReadHeader {url sock} {
704 global URL debuglevel
706 if {$debuglevel > 1} {
707 puts "HTTP head $url"
709 if {[catch {set buffer [read $sock 2148]}]} {
711 RobotRestart $url $sock
714 set readCount [string length $buffer]
716 if {$readCount <= 0} {
718 RobotRestart $url $sock
720 # puts "Got $readCount bytes"
721 set URL($url,buf) $URL($url,buf)$buffer
723 set n [string first \r\n\r\n $URL($url,buf)]
727 set headbuf [string range $URL($url,buf) 0 $n]
729 set URL($url,buf) [string range $URL($url,buf) $n end]
731 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
732 set lines [split $headbuf \n]
733 foreach line $lines {
734 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
735 set URL($url,head,[string tolower $name]) [string trim $value]
738 puts "HTTP CODE $code"
739 set URL($url,state) skip
742 RobotRedirect $url $URL($url,head,location) 301
743 RobotRestart $url $sock
746 RobotRedirect $url $URL($url,head,location) 302
747 RobotRestart $url $sock
750 if {![info exists URL($url,head,content-type)]} {
751 set URL($url,head,content-type) {}
754 switch -glob -- $URL($url,head,content-type) {
759 if {![regexp {/robots.txt$} $url]} {
760 if {![checkrule mime $URL($url,head,content-type)]} {
761 RobotError $url mimedeny
762 RobotRestart $url $sock
766 fileevent $sock readable [list RobotReadContent $url $sock $binary]
769 RobotError $url $code
770 RobotRestart $url $sock
777 proc RobotSockCancel {url sock} {
779 puts "RobotSockCancel sock=$sock url=$url"
781 RobotRestart $url $sock
784 proc RobotConnect {url sock} {
785 global URL agent acceptLanguage
787 fconfigure $sock -translation {lf crlf} -blocking 0
788 fileevent $sock readable [list RobotReadHeader $url $sock]
789 puts $sock "GET $URL($url,path) HTTP/1.0"
790 puts $sock "Host: $URL($url,host)"
791 puts $sock "User-Agent: $agent"
792 if {[string length $acceptLanguage]} {
793 puts $sock "Accept-Language: $acceptLanguage"
797 set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
804 proc RobotGetUrl {url phost} {
805 global URL robotsRunning
807 puts "Retrieve $robotsRunning url=$url"
808 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
811 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
815 set URL($url,method) $method
816 set URL($url,host) $host
817 set URL($url,hostport) $hostport
818 set URL($url,path) $path
819 set URL($url,state) head
822 if {[string compare $path /robots.txt]} {
824 if {![info exists URL($hostport,robots)]} {
825 puts "READING robots.txt for host $hostport"
826 if {[RobotFileExist visited $hostport /robots.txt]} {
827 set inf [RobotFileOpen visited $hostport /robots.txt r]
828 set buf [read $inf 32768]
831 set buf "User-agent: *\nAllow: /\n"
833 RobotsTxt0 URL($hostport,robots) $buf
835 if {[info exists URL($hostport,robots)]} {
836 foreach l $URL($hostport,robots) {
837 if {[string first [lindex $l 1] $path] == 0} {
844 puts "skipped due to robots.txt"
848 if [catch {set sock [socket -async $host $port]}] {
851 RobotConnect $url $sock
856 if {![llength [info commands htmlSwitch]]} {
857 set e [info sharedlibextension]
858 if {[catch {load ./tclrobot$e}]} {
863 set agent "zmbot/0.1"
864 if {![catch {set os [exec uname -s -r]}]} {
865 set agent "$agent ($os)"
880 set acceptLanguage {}
882 set status(unvisited) 0
883 set status(visited) 0
888 # Rules: allow, deny, url
890 proc checkrule {type this} {
894 if {$debuglevel > 3} {
895 puts "CHECKRULE $type $this"
897 if {[info exist alrules]} {
899 if {$debuglevel > 3} {
903 if {[lindex $l 1] != $type} continue
904 # consider mask (! negates)
905 set masks [lindex $l 2]
907 foreach mask $masks {
908 if {$debuglevel > 4} {
909 puts "consider single mask $mask"
911 if {[string index $mask 0] == "!"} {
912 set mask [string range $mask 1 end]
913 if {[string match $mask $this]} continue
915 if {![string match $mask $this]} continue
919 if {$debuglevel > 4} {
923 # OK, we have a match
924 if {[lindex $l 0] == "allow"} {
925 if {$debuglevel > 3} {
926 puts "CHECKRULE MATCH OK"
930 if {$debuglevel > 3} {
931 puts "CHECKFULE MATCH FAIL"
937 if {$debuglevel > 3} {
938 puts "CHECKRULE MATCH OK"
947 if {[RobotHref http://www.indexdata.dk/ href host path]} {
948 if {![RobotFileExist visited $host $path]} {
949 set outf [RobotFileOpen unvisited $host $path]
950 RobotWriteRecord $outf href 0
956 proc deny {type stuff} {
959 lappend alrules [list deny $type $stuff]
962 proc allow {type stuff} {
965 lappend alrules [list allow $type $stuff]
971 set debuglevel $level
977 set l [llength $argv]
980 puts {tclrobot: usage:}
981 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-r rules] [url ..]}
982 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
987 set arg [lindex $argv $i]
988 switch -glob -- $arg {
990 set robotsMax [string range $arg 2 end]
991 if {![string length $robotsMax]} {
992 set robotsMax [lindex $argv [incr i]]
996 set maxdistance [string range $arg 2 end]
997 if {![string length $maxdistance]} {
998 set maxdistance [lindex $argv [incr i]]
1002 set dom [string range $arg 2 end]
1003 if {![string length $dom]} {
1004 set dom [lindex $argv [incr i]]
1006 lappend domains $dom
1009 set idletime [string range $arg 2 end]
1010 if {![string length $idletime]} {
1011 set idletime [lindex $argv [incr i]]
1015 set acceptLanguage [string range $arg 2 end]
1016 if {![string length $acceptLanguage]} {
1017 set acceptLanguage [lindex $argv [incr i]]
1021 set rfile [string range $arg 2 end]
1022 if {![string length $rfile]} {
1023 set rfile [lindex $argv [incr i]]
1029 if {[RobotHref http://www.indexdata.dk/ href host path]} {
1030 if {![RobotFileExist visited $host $path]} {
1031 set outf [RobotFileOpen unvisited $host $path]
1032 RobotWriteRecord $outf href 0
1033 RobotFileClose $outf
1041 if {![info exist domains]} {
1044 if {![info exist maxdistance]} {
1047 if {![info exist robotsMax]} {
1051 puts "domains=$domains"
1052 puts "max distance=$maxdistance"
1053 puts "max jobs=$robotsMax"
1059 while {$robotsRunning} {
1063 set statusfile [open status w]
1064 puts $statusfile "$status(unvisited) $status(bad) $status(visited)"