2 # $Id: robot.tcl,v 1.42 2003/06/11 08:49:09 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
13 return $lead/[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set off [string last / $n]
20 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
21 if {[string length $sb]} {
29 proc RobotWriteRecord {outf fromurl distance} {
31 puts $outf "<distance>"
33 puts $outf "</distance>"
34 puts $outf "<fromurl>"
36 puts $outf "</fromurl>"
40 proc RobotReadRecord {inf fromurlx distancex} {
41 upvar $fromurlx fromurl
42 upvar $distancex distance
45 set distance [string trim [gets $inf]]
46 # puts "got distance = $distance"
49 set fromurl [string trim [gets $inf]]
52 proc RobotFileNext {task area} {
57 # puts "RobotFileNext seq=$control($task,seq)"
58 if {$control($task,seq) < 0} {
61 if {$control($task,seq) == 0} {
62 if {[catch {set ns($task) [glob $task/$area/*]}]} {
66 # puts "ns=$ns($task)"
67 set off [string length $task/$area]
69 set n [lindex $ns($task) $control($task,seq)]
71 if {![string length $n]} {
72 set control($task,seq) -1
74 set statusfile [open $task/status w]
75 puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
79 incr control($task,seq)
80 if {[file isfile $n/frobots.txt]} {
81 puts "ok returning http://[string range $n $off end]/robots.txt"
82 return http://[string range $n $off end]/robots.txt
83 } elseif {[file isdirectory $n]} {
84 set sb [RobotFileNext1 $n http://[string range $n $off end]]
85 if {[string length $sb]} {
89 puts "no more work at end of RobotFileNext n=$n"
95 proc RobotFileExist {task area host path} {
98 if {$debuglevel > 3} {
99 puts "RobotFileExist begin area=$area host=$host path=$path"
101 set lpath [split $path /]
102 set l [llength $lpath]
104 set t [lindex $lpath $l]
106 set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
107 if {$debuglevel > 3} {
108 puts "RobotFileExist end npath=$npath"
110 return [file exists $npath]
113 proc RobotFileUnlink {task area host path} {
115 # puts "RobotFileUnlink begin"
116 # puts "area=$area host=$host path=$path"
117 set lpath [split $path /]
118 set l [llength $lpath]
120 set t [lindex $lpath $l]
122 set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
123 # puts "npath=$npath"
124 set comp [split $npath /]
125 if {[catch {exec rm [join $comp /]}]} return
127 set l [llength $comp]
130 incr status($task,$area) -1
131 for {set i $l} {$i > 0} {incr i -1} {
132 set path [join [lrange $comp 0 $i] /]
133 if {![catch {glob $path/*}]} return
136 # puts "RobotFileUnlink end"
139 proc RobotFileClose {out} {
140 if [string compare $out stdout] {
145 proc RobotFileOpen {task area host path {mode w}} {
151 # puts "RobotFileOpen task=$task path=$path"
153 if {![info exists workdir]} {
156 if {$debuglevel > 3} {
157 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
159 if {[string compare $orgPwd $workdir]} {
160 puts "ooops. RobotFileOpen failed"
161 puts "workdir = $workdir"
166 set comp [split $task/$area/$host /]
167 set len [llength $comp]
170 # puts "1 comp=$comp"
172 for {set i 0} {$i <= $len} {incr i} {
173 set d [lindex $comp $i]
174 if {[string length $d] == 0} {
176 } elseif {[catch {cd $d}]} {
179 if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
180 if {[string compare $path /robots.txt]} {
181 set out [open frobots.txt w]
182 puts "creating robots.txt in $d"
184 incr status($task,unvisited)
190 set comp [split $path /]
191 set len [llength $comp]
194 # puts "2 path=$path comp=$comp"
196 for {set i 0} {$i < $len} {incr i} {
197 set d "d[lindex $comp $i]"
198 if {[string length $d] > 1} {
199 if {[catch {cd $d}]} {
205 set d [lindex $comp $len]
206 if {[string length $d]} {
207 set out [open f$d $mode]
209 set out [open f $mode]
212 incr status($task,$area)
219 proc RobotStartJob {fname t} {
222 set f [open $fname r]
224 puts "Reading $fname"
226 if {![regexp {<status>([^<]*)</status>} $xml x status]} {
229 if {$status == "done"} {
233 puts "status = $status"
242 set action $parm(action)
243 if {$type == "domain"} {
244 $action url http://$body/*
246 if {$type == "url"} {
249 if {$type == "mime"} {
253 set control($t,distance) $body
255 set control($t,filestatus) $body
257 if {$status == "pending"} {
258 regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
259 set f [open $fname w]
260 puts -nonewline $f $xml2
265 proc RobotDoneJob {t} {
268 if {![info exists daemon_dir]} {
274 set f [open $fname r]
276 puts "Reading $fname"
277 regexp {<status>([^<]*)</status>} $xml x status
279 puts "status = $status"
282 regsub {<status>[^<]*</status>} $xml {<status>done</status>} xml2
283 set f [open $fname w]
284 puts -nonewline $f $xml2
288 proc RobotScanDir {} {
291 if {![info exists daemon_dir]} {
294 foreach d $daemon_dir {
295 if {[catch {set files [glob $d/*.tkl]}]} {
298 foreach fname $files {
299 if {[file isfile $fname] && [file readable $fname]} {
300 set t [file rootname $fname]
301 RobotStartJob $fname $t
307 proc RobotRR {task} {
308 global control robotsRunning tasks robotsMax status
310 puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
311 incr robotsRunning -1
313 # only one task gets through...
314 if {[string compare [lindex $tasks 0] $task]} {
317 puts "RobotRR. task = $task"
318 while {$robotsRunning} {
322 if {[catch {RobotScanDir} msg]} {
323 puts "RobotScanDir failed"
327 set statusfile [open $t/status w]
328 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
330 set control($t,seq) 0
335 proc RobotDaemonSig {} {
341 proc RobotDaemonLoop {} {
342 global daemon_cnt tasks robotsRunning status
350 if {[info exists tasks]} {
351 puts "daemon loop tasks $tasks"
353 set control($t,seq) 0
356 while {$robotsRunning} {
360 after 30000 RobotDaemonSig
365 proc RobotRestart {task url sock} {
366 global URL robotsRunning
369 after cancel $URL($sock,cancel)
371 foreach v [array names URL $task,$url,*] {
375 incr robotsRunning -1
379 proc RobotStart {task} {
381 global robotsRunning robotsMax idletime status tasks
383 # puts "RobotStart $task running=$robotsRunning"
385 set url [RobotFileNext $task unvisited]
386 if {[string compare $url done] == 0} {
387 puts "In RobotStart task $task done"
391 if {[string compare $t $task]} {
397 if {![info exists ntasks]} {
406 if {![string length $url]} {
410 if {[string compare $url wait] == 0} {
411 after $idletime [list RobotRR $task]
414 set r [RobotGetUrl $task $url {}]
416 if {$robotsRunning >= $robotsMax} return
418 incr robotsRunning -1
419 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
420 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
423 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
428 proc headSave {task url out} {
431 if {[info exists URL($task,$url,head,last-modified)]} {
432 puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
435 if {[info exists URL($task,$url,head,date)]} {
436 puts $out " <date>$URL($task,$url,head,date)</date>"
438 if {[info exists URL($task,$url,head,content-length)]} {
439 puts $out " <by>$URL($task,$url,head,content-length)</by>"
441 if {[info exists URL($task,$url,head,server)]} {
442 puts $out " <format>$URL($task,$url,head,server)</format>"
445 puts $out {<publisher>}
446 puts $out " <identifier>$url</identifier>"
447 if {[info exists URL($task,$url,head,content-type)]} {
448 puts $out " <type>$URL($task,$url,head,content-type)</type>"
450 puts $out {</publisher>}
453 proc RobotHref {task url hrefx hostx pathx} {
454 global URL control debuglevel
459 if {$debuglevel > 1} {
460 puts "Ref input url = $url href=$href"
463 if {[string first { } $href] >= 0} {
466 if {[string length $href] > 256} {
470 # Skip pages that have ? in them
471 # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
474 # get method (if any)
475 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
479 if {[string compare $method http]} {
484 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
485 if {![string length $surl]} {
488 if {[info exist control($task,domains)]} {
490 foreach domain $control($task,domains) {
491 if {[string match $domain $host]} {
501 regexp {^([^\#]*)} $hpath x surl
502 set host $URL($task,$url,hostport)
504 if {![string length $surl]} {
507 if {[string first / $surl]} {
509 set curpath $URL($task,$url,path)
510 if {[info exists URL($task,$url,bpath)]} {
511 set curpath $URL($task,$url,bpath)
513 regexp {^([^\#?]*)} $curpath x dpart
514 set l [string last / $dpart]
515 if {[expr $l >= 0]} {
516 set surl [string range $dpart 0 $l]$surl
518 set surl $dpart/$surl
521 set surllist [split $surl /]
524 foreach c $surllist {
529 set path [lrange $path 0 $pathl]
542 if {$debuglevel > 4} {
543 puts "pathl=$pathl output path=$path"
545 set path [join $path /]
546 if {![string length $path]} {
549 regsub -all {~} $path {%7E} path
550 set href "$method://$host$path"
552 if {$debuglevel > 1} {
553 puts "Ref result = $href"
555 return [checkrule $task url $href]
558 proc RobotError {task url code} {
561 puts "Bad URL $url (code $code)"
564 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
565 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
566 RobotReadRecord $inf fromurl distance
569 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
570 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
571 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
572 RobotWriteRecord $outf $fromurl $distance
577 proc RobotRedirect {task url tourl code} {
580 puts "Redirecting from $url to $tourl"
584 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
585 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
586 RobotReadRecord $inf fromurl distance
589 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
590 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
591 RobotWriteRecord $outf $fromurl $distance
594 if {[RobotHref $task $url tourl host path]} {
595 if {![RobotFileExist $task visited $host $path]} {
596 if {![RobotFileExist $task unvisited $host $path]} {
597 set outf [RobotFileOpen $task unvisited $host $path]
598 RobotWriteRecord $outf $fromurl $distance
603 set inf [RobotFileOpen $task visited $host $path r]
604 RobotReadRecord $inf oldurl olddistance
606 if {[string length $olddistance] == 0} {
609 if {[string length $distance] == 0} {
612 puts "distance=$distance olddistance=$olddistance"
613 if {[expr $distance < $olddistance]} {
614 set outf [RobotFileOpen $task unvisited $host $path]
615 RobotWriteRecord $outf $tourl $distance
620 if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
626 proc link {task url out href body distance} {
628 if {[expr $distance > $control($task,distance)]} return
630 if {![RobotHref $task $url href host path]} return
633 puts $out "<identifier>$href</identifier>"
634 puts $out "<description>$body</description>"
637 if {![RobotFileExist $task visited $host $path]} {
639 if {![RobotFileExist $task bad $host $path]} {
640 if {[RobotFileExist $task unvisited $host $path]} {
641 set inf [RobotFileOpen $task unvisited $host $path r]
642 RobotReadRecord $inf oldurl olddistance
648 if {[string length $olddistance] == 0} {
651 if {[expr $distance < $olddistance]} {
652 set outf [RobotFileOpen $task unvisited $host $path]
653 RobotWriteRecord $outf $url $distance
656 } elseif {[string compare $href $url]} {
657 set inf [RobotFileOpen $task visited $host $path r]
658 RobotReadRecord $inf xurl olddistance
660 if {[string length $olddistance] == 0} {
663 if {[expr $distance < $olddistance]} {
664 puts "OK remarking url=$url href=$href"
665 puts "olddistance = $olddistance"
666 puts "newdistance = $distance"
667 set outf [RobotFileOpen $task unvisited $host $path]
668 RobotWriteRecord $outf $url $distance
674 proc RobotTextHtml {task url out} {
677 # set title so we can emit it for the body
679 # if true, nothing will be indexed
681 # if true, nothing will be followed
686 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
687 set fdistance $URL($task,$url,dist)
688 set distance [expr $fdistance + 1]
690 htmlSwitch $URL($task,$url,buf) \
694 # collect metadata and save NAME= CONTENT=..
697 puts -nonewline $out "<meta"
698 set al [array names parm]
700 set al [string tolower $a]
701 puts -nonewline $out " $al"
702 puts -nonewline $out {="}
703 puts -nonewline $out $parm($a)
704 puts -nonewline $out {"}
707 set metaname [string tolower $parm($a)]
710 set metacontent $parm($a)
716 # go through robots directives (af any)
717 if {![string compare $metaname robots]} {
718 set direcs [split [string tolower $metacontent] ,]
719 if {[lsearch $direcs noindex] >= 0} {
722 if {[lsearch $direcs nofollow] >= 0} {
727 # don't print title of document content if noindex is used
729 puts $out "<title>$title</title>"
730 regsub -all {<!--[^-]*-->} $body { } abody
731 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
732 regsub -all {<[^\>]+>} $bbody {} nbody
733 puts $out "<documentcontent>"
735 puts $out "</documentcontent>"
739 if {![info exists parm(href)]} {
742 set href [string trim $parm(href)]
743 if {![RobotHref $task $url href host path]} continue
744 set URL($task,$url,bpath) $path
746 # <a href="...."> .. </a>
747 # we're not using nonest - otherwise body isn't set
748 if {$nofollow} continue
749 if {![info exists parm(href)]} {
752 link $task $url $out [string trim $parm(href)] $body $distance
754 if {$nofollow} continue
755 if {![info exists parm(href)]} {
758 link $task $url $out [string trim $parm(href)] $body $distance
760 if {![info exists parm(src)]} {
763 link $task $url $out [string trim $parm(src)] $body $fdistance
767 proc RobotsTxt {task url} {
770 RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
773 proc RobotsTxt0 {task v buf} {
776 foreach l [split $buf \n] {
777 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
778 set arg [string trim $arg]
779 puts "cmd=$cmd arg=$arg"
780 switch -- [string tolower $cmd] {
783 set pat [string tolower $arg]*
784 set section [string match $pat $agent]
788 puts "rule [list 0 $arg]"
789 lappend $v [list 0 $arg]
794 puts "rule [list 1 $arg]"
795 lappend $v [list 1 $arg]
803 proc RobotTextPlain {task url out} {
806 puts $out "<documentcontent>"
807 regsub -all {<} $URL($task,$url,buf) {\<} content
809 puts $out "</documentcontent>"
811 if {![string compare $URL($task,$url,path) /robots.txt]} {
816 proc RobotWriteMetadata {task url out} {
822 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
823 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
824 RobotReadRecord $inf fromurl distance
827 set URL($task,$url,dist) $distance
828 puts $out "<distance>"
829 puts $out " $distance"
830 puts $out "</distance>"
831 headSave $task $url $out
832 puts "Parsing $url distance=$distance"
833 switch $URL($task,$url,head,content-type) {
835 if {[string length $distance]} {
836 RobotTextHtml $task $url $out
840 RobotTextPlain $task $url $out
846 proc Robot200 {task url} {
849 set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
850 puts -nonewline $out $URL($task,$url,buf)
853 set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
854 RobotWriteMetadata $task $url $out
857 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
860 proc RobotReadContent {task url sock binary} {
863 set buffer [read $sock 16384]
864 set readCount [string length $buffer]
866 if {$readCount <= 0} {
868 RobotRestart $task $url $sock
869 } elseif {!$binary && [string first \0 $buffer] >= 0} {
871 RobotRestart $task $url $sock
873 # puts "Got $readCount bytes"
874 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
878 proc RobotReadHeader {task url sock} {
879 global URL debuglevel
881 if {$debuglevel > 1} {
882 puts "HTTP head $url"
884 if {[catch {set buffer [read $sock 2148]}]} {
885 RobotError $task $url 404
886 RobotRestart $task $url $sock
889 set readCount [string length $buffer]
891 if {$readCount <= 0} {
892 RobotError $task $url 404
893 RobotRestart $task $url $sock
895 # puts "Got $readCount bytes"
896 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
898 set n [string first \r\n\r\n $URL($task,$url,buf)]
902 set headbuf [string range $URL($task,$url,buf) 0 $n]
904 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
906 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
907 set lines [split $headbuf \n]
908 foreach line $lines {
909 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
910 set URL($task,$url,head,[string tolower $name]) [string trim $value]
913 puts "HTTP CODE $code"
914 set URL($task,$url,state) skip
917 RobotRedirect $task $url $URL($task,$url,head,location) 301
918 RobotRestart $task $url $sock
921 RobotRedirect $task $url $URL($task,$url,head,location) 302
922 RobotRestart $task $url $sock
925 if {![info exists URL($task,$url,head,content-type)]} {
926 set URL($task,$url,head,content-type) {}
929 switch -glob -- $URL($task,$url,head,content-type) {
934 if {![regexp {/robots.txt$} $url]} {
935 if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
936 RobotError $task $url mimedeny
937 RobotRestart $task $url $sock
941 fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
944 RobotError $task $url $code
945 RobotRestart $task $url $sock
952 proc RobotSockCancel {task url sock} {
954 puts "RobotSockCancel sock=$sock url=$url"
955 RobotError $task $url 401
956 RobotRestart $task $url $sock
959 proc RobotConnect {task url sock} {
960 global URL agent acceptLanguage
962 fconfigure $sock -translation {lf crlf} -blocking 0
963 fileevent $sock readable [list RobotReadHeader $task $url $sock]
964 puts $sock "GET $URL($task,$url,path) HTTP/1.0"
965 puts $sock "Host: $URL($task,$url,host)"
966 puts $sock "User-Agent: $agent"
967 if {[string length $acceptLanguage]} {
968 puts $sock "Accept-Language: $acceptLanguage"
971 set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
972 if {[catch {flush $sock}]} {
973 RobotError $task $url 404
974 RobotRestart $task $url $sock
982 proc RobotGetUrl {task url phost} {
983 global URL robotsRunning
985 puts "Retrieve running=$robotsRunning url=$url task=$task"
986 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
989 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
993 set URL($task,$url,method) $method
994 set URL($task,$url,host) $host
995 set URL($task,$url,hostport) $hostport
996 set URL($task,$url,path) $path
997 set URL($task,$url,state) head
998 set URL($task,$url,buf) {}
1000 if {[string compare $path /robots.txt]} {
1002 if {![info exists URL($hostport,robots)]} {
1003 puts "READING robots.txt for host $hostport"
1004 if {[RobotFileExist $task visited $hostport /robots.txt]} {
1005 set inf [RobotFileOpen $task visited $hostport /robots.txt r]
1006 set buf [read $inf 32768]
1009 set buf "User-agent: *\nAllow: /\n"
1011 RobotsTxt0 $task URL($hostport,robots) $buf
1013 if {[info exists URL($hostport,robots)]} {
1014 foreach l $URL($hostport,robots) {
1015 if {[string first [lindex $l 1] $path] == 0} {
1016 set ok [lindex $l 0]
1022 puts "skipped due to robots.txt"
1026 if [catch {set sock [socket -async $host $port]}] {
1029 RobotConnect $task $url $sock
1034 if {![llength [info commands htmlSwitch]]} {
1035 set e [info sharedlibextension]
1036 if {[catch {load ./tclrobot$e}]} {
1041 set agent "zmbot/0.2"
1042 if {![catch {set os [exec uname -s -r]}]} {
1043 set agent "$agent ($os)"
1046 puts "agent: $agent"
1057 set acceptLanguage {}
1060 # Rules: allow, deny, url
1062 proc checkrule {task type this} {
1068 if {$debuglevel > 3} {
1069 puts "CHECKRULE $type $this"
1071 if {[info exist control($task,alrules)]} {
1072 foreach l $control($task,alrules) {
1073 if {$debuglevel > 3} {
1077 if {[lindex $l 1] != $type} continue
1078 # consider mask (! negates)
1079 set masks [lindex $l 2]
1082 foreach mask $masks {
1083 if {$debuglevel > 4} {
1084 puts "consider single mask $mask"
1086 if {[string index $mask 0] == "!"} {
1087 set mask [string range $mask 1 end]
1088 if {[string match $mask $this]} continue
1090 if {![string match $mask $this]} continue
1094 if {$debuglevel > 4} {
1098 # OK, we have a match
1099 if {[lindex $l 0] == "allow"} {
1100 if {$debuglevel > 3} {
1101 puts "CHECKRULE MATCH OK"
1105 if {$debuglevel > 3} {
1106 puts "CHECKFULE MATCH FAIL"
1112 if {$debuglevel > 3} {
1113 puts "CHECKRULE MATCH DEFAULT $default_ret"
1120 global debuglevel task
1122 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1123 if {![RobotFileExist $task visited $host $path]} {
1124 set outf [RobotFileOpen $task unvisited $host $path]
1125 RobotWriteRecord $outf href 0
1126 RobotFileClose $outf
1131 proc deny {type stuff} {
1134 lappend control($task,alrules) [list deny $type $stuff]
1137 proc allow {type stuff} {
1140 lappend control($task,alrules) [list allow $type $stuff]
1143 proc debug {level} {
1146 set debuglevel $level
1150 global tasks task status control
1154 if {[info exists tasks]} {
1155 if {[lsearch -exact $tasks $t] >= 0} {
1161 set status($t,unvisited) 0
1162 set status($t,visited) 0
1163 set status($t,bad) 0
1164 set status($t,raw) 0
1165 set status($t,active) 1
1166 set control($t,seq) 0
1167 set control($t,distance) 10
1171 # Little utility that ensures that at least one task is present (main).
1174 if {![info exist tasks]} {
1183 set l [llength $argv]
1186 puts {tclrobot: usage:}
1187 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
1188 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1195 set arg [lindex $argv $i]
1196 switch -glob -- $arg {
1198 set t [string range $arg 2 end]
1199 if {![string length $t]} {
1200 set t [lindex $argv [incr i]]
1205 set dir [string range $arg 2 end]
1206 if {![string length $dir]} {
1207 set dir [lindex $argv [incr i]]
1209 lappend daemon_dir $dir
1212 set robotsMax [string range $arg 2 end]
1213 if {![string length $robotsMax]} {
1214 set robotsMax [lindex $argv [incr i]]
1219 set control($task,distance) [string range $arg 2 end]
1220 if {![string length $control($task,distance)]} {
1221 set control($task,distance) [lindex $argv [incr i]]
1226 set dom [string range $arg 2 end]
1227 if {![string length $dom]} {
1228 set dom [lindex $argv [incr i]]
1230 lappend control($task,domains) $dom
1233 set idletime [string range $arg 2 end]
1234 if {![string length $idletime]} {
1235 set idletime [lindex $argv [incr i]]
1240 set acceptLanguage [string range $arg 2 end]
1241 if {![string length $acceptLanguage]} {
1242 set acceptLanguage [lindex $argv [incr i]]
1247 set rfile [string range $arg 2 end]
1248 if {![string length $rfile]} {
1249 set rfile [lindex $argv [incr i]]
1251 catch {unset maxdistance}
1253 if {[info exists maxdistance]} {
1254 set control($task,distance) $maxdistance
1260 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1261 if {![RobotFileExist $task visited $host $path]} {
1262 set outf [RobotFileOpen $task unvisited $host $path]
1263 RobotWriteRecord $outf href 0
1264 RobotFileClose $outf
1272 if {![info exist robotsMax]} {
1276 if {[info exist daemon_dir]} {
1281 puts "max distance=$control($t,distance)"
1282 if {[info exists control($t,domains)]} {
1283 puts "domains=$control($t,domains)"
1286 puts "max jobs=$robotsMax"
1292 while {$robotsRunning} {
1296 if {[info exists tasks]} {
1298 set statusfile [open $t/status w]
1299 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"