2 # $Id: robot.tcl,v 1.43 2003/06/11 09:40:22 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
14 set end [string length $n]
17 return $lead/[string range $n $off $end]
21 if {[file isdirectory $n]} {
22 set off [string last / $n]
25 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
26 if {[string length $sb]} {
34 proc RobotWriteRecord {outf fromurl distance} {
36 puts $outf "<distance>"
38 puts $outf "</distance>"
39 puts $outf "<fromurl>"
41 puts $outf "</fromurl>"
45 proc RobotReadRecord {inf fromurlx distancex} {
46 upvar $fromurlx fromurl
47 upvar $distancex distance
50 set distance [string trim [gets $inf]]
51 # puts "got distance = $distance"
54 set fromurl [string trim [gets $inf]]
57 proc RobotFileNext {task area} {
62 # puts "RobotFileNext seq=$control($task,seq)"
63 if {$control($task,seq) < 0} {
66 if {$control($task,seq) == 0} {
67 if {[catch {set ns($task) [glob $task/$area/*]}]} {
71 # puts "ns=$ns($task)"
72 set off [string length $task/$area]
74 set n [lindex $ns($task) $control($task,seq)]
76 if {![string length $n]} {
77 set control($task,seq) -1
79 set statusfile [open $task/status w]
80 puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
84 incr control($task,seq)
85 if {[file isfile $n/robots.txt_.tkl]} {
86 # puts "ok returning http://[string range $n $off end]/robots.txt"
87 return http://[string range $n $off end]/robots.txt
88 } elseif {[file isdirectory $n]} {
89 set sb [RobotFileNext1 $n http://[string range $n $off end]]
90 if {[string length $sb]} {
94 puts "no more work at end of RobotFileNext n=$n"
100 proc RobotFileExist {task area host path} {
103 if {$debuglevel > 3} {
104 puts "RobotFileExist begin area=$area host=$host path=$path"
106 return [file exists $task/$area/$host${path}_.tkl]
109 proc RobotFileUnlink {task area host path} {
111 # puts "RobotFileUnlink begin"
112 # puts "area=$area host=$host path=$path"
113 set npath $task/$area/$host${path}_.tkl
114 # puts "npath=$npath"
115 set comp [split $npath /]
116 if {[catch {exec rm $npath}]} return
118 set l [llength $comp]
120 incr status($task,$area) -1
121 for {set i $l} {$i > 0} {incr i -1} {
122 set path [join [lrange $comp 0 $i] /]
123 if {![catch {glob $path/*}]} return
126 # puts "RobotFileUnlink end"
129 proc RobotFileClose {out} {
130 if [string compare $out stdout] {
135 proc RobotFileOpen {task area host path {mode w}} {
141 # puts "RobotFileOpen task=$task path=$path"
143 if {![info exists workdir]} {
146 if {$debuglevel > 3} {
147 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
149 if {[string compare $orgPwd $workdir]} {
150 puts "ooops. RobotFileOpen failed"
151 puts "workdir = $workdir"
156 set comp [split $task/$area/$host /]
157 set len [llength $comp]
160 # puts "1 comp=$comp"
162 for {set i 0} {$i <= $len} {incr i} {
163 set d [lindex $comp $i]
164 if {[string length $d] == 0} {
166 } elseif {[catch {cd $d}]} {
169 if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
170 if {[string compare $path /robots.txt]} {
171 set out [open robots.txt_.tkl w]
172 puts "creating robots.txt in $d"
174 incr status($task,unvisited)
180 set comp [split $path /]
181 set len [llength $comp]
184 # puts "2 path=$path comp=$comp"
186 for {set i 0} {$i < $len} {incr i} {
187 set d [lindex $comp $i]
188 if {[string length $d] > 0} {
189 if {[catch {cd $d}]} {
195 set d [lindex $comp $len]
196 set out [open ${d}_.tkl $mode]
198 incr status($task,$area)
204 proc RobotStartJob {fname t} {
207 set f [open $fname r]
209 puts "Reading $fname"
211 if {![regexp {<status>([^<]*)</status>} $xml x status]} {
214 if {$status == "done"} {
218 puts "status = $status"
227 set action $parm(action)
228 if {$type == "domain"} {
229 $action url http://$body/*
231 if {$type == "url"} {
234 if {$type == "mime"} {
238 set control($t,distance) $body
240 set control($t,filestatus) $body
242 if {$status == "pending"} {
243 regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
244 set f [open $fname w]
245 puts -nonewline $f $xml2
250 proc RobotDoneJob {t} {
253 if {![info exists daemon_dir]} {
259 set f [open $fname r]
261 puts "Reading $fname"
262 regexp {<status>([^<]*)</status>} $xml x status
264 puts "status = $status"
267 regsub {<status>[^<]*</status>} $xml {<status>done</status>} xml2
268 set f [open $fname w]
269 puts -nonewline $f $xml2
273 proc RobotScanDir {} {
276 if {![info exists daemon_dir]} {
279 foreach d $daemon_dir {
280 if {[catch {set files [glob $d/*.tkl]}]} {
283 foreach fname $files {
284 if {[file isfile $fname] && [file readable $fname]} {
285 set t [file rootname $fname]
286 RobotStartJob $fname $t
292 proc RobotRR {task} {
293 global control robotsRunning tasks robotsMax status
295 puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
296 incr robotsRunning -1
298 # only one task gets through...
299 if {[string compare [lindex $tasks 0] $task]} {
302 puts "RobotRR. task = $task"
303 while {$robotsRunning} {
307 if {[catch {RobotScanDir} msg]} {
308 puts "RobotScanDir failed"
312 set statusfile [open $t/status w]
313 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
315 set control($t,seq) 0
320 proc RobotDaemonSig {} {
326 proc RobotDaemonLoop {} {
327 global daemon_cnt tasks robotsRunning status
335 if {[info exists tasks]} {
336 puts "daemon loop tasks $tasks"
338 set control($t,seq) 0
341 while {$robotsRunning} {
345 after 30000 RobotDaemonSig
350 proc RobotRestart {task url sock} {
351 global URL robotsRunning
354 after cancel $URL($sock,cancel)
356 foreach v [array names URL $task,$url,*] {
360 incr robotsRunning -1
364 proc RobotStart {task} {
366 global robotsRunning robotsMax idletime status tasks
368 # puts "RobotStart $task running=$robotsRunning"
370 set url [RobotFileNext $task unvisited]
371 if {[string compare $url done] == 0} {
372 puts "In RobotStart task $task done"
376 if {[string compare $t $task]} {
382 if {![info exists ntasks]} {
391 if {![string length $url]} {
395 if {[string compare $url wait] == 0} {
396 after $idletime [list RobotRR $task]
399 set r [RobotGetUrl $task $url {}]
401 if {$robotsRunning >= $robotsMax} return
403 incr robotsRunning -1
404 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
405 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
408 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
413 proc headSave {task url out} {
416 if {[info exists URL($task,$url,head,last-modified)]} {
417 puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
420 if {[info exists URL($task,$url,head,date)]} {
421 puts $out " <date>$URL($task,$url,head,date)</date>"
423 if {[info exists URL($task,$url,head,content-length)]} {
424 puts $out " <by>$URL($task,$url,head,content-length)</by>"
426 if {[info exists URL($task,$url,head,server)]} {
427 puts $out " <format>$URL($task,$url,head,server)</format>"
430 puts $out {<publisher>}
431 puts $out " <identifier>$url</identifier>"
432 if {[info exists URL($task,$url,head,content-type)]} {
433 puts $out " <type>$URL($task,$url,head,content-type)</type>"
435 puts $out {</publisher>}
438 proc RobotHref {task url hrefx hostx pathx} {
439 global URL control debuglevel
444 if {$debuglevel > 1} {
445 puts "Ref input url = $url href=$href"
448 if {[string first { } $href] >= 0} {
451 if {[string length $href] > 256} {
455 # Skip pages that have ? in them
456 # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
459 # get method (if any)
460 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
464 if {[string compare $method http]} {
469 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
470 if {![string length $surl]} {
473 if {[info exist control($task,domains)]} {
475 foreach domain $control($task,domains) {
476 if {[string match $domain $host]} {
486 regexp {^([^\#]*)} $hpath x surl
487 set host $URL($task,$url,hostport)
489 if {![string length $surl]} {
492 if {[string first / $surl]} {
494 set curpath $URL($task,$url,path)
495 if {[info exists URL($task,$url,bpath)]} {
496 set curpath $URL($task,$url,bpath)
498 regexp {^([^\#?]*)} $curpath x dpart
499 set l [string last / $dpart]
500 if {[expr $l >= 0]} {
501 set surl [string range $dpart 0 $l]$surl
503 set surl $dpart/$surl
506 set surllist [split $surl /]
509 foreach c $surllist {
514 set path [lrange $path 0 $pathl]
527 if {$debuglevel > 4} {
528 puts "pathl=$pathl output path=$path"
530 set path [join $path /]
531 if {![string length $path]} {
534 regsub -all {~} $path {%7E} path
535 set href "$method://$host$path"
537 if {$debuglevel > 1} {
538 puts "Ref result = $href"
540 return [checkrule $task url $href]
543 proc RobotError {task url code} {
546 puts "Bad URL $url (code $code)"
549 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
550 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
551 RobotReadRecord $inf fromurl distance
554 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
555 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
556 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
557 RobotWriteRecord $outf $fromurl $distance
562 proc RobotRedirect {task url tourl code} {
565 puts "Redirecting from $url to $tourl"
569 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
570 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
571 RobotReadRecord $inf fromurl distance
574 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
575 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
576 RobotWriteRecord $outf $fromurl $distance
579 if {[RobotHref $task $url tourl host path]} {
580 if {![RobotFileExist $task visited $host $path]} {
581 if {![RobotFileExist $task unvisited $host $path]} {
582 set outf [RobotFileOpen $task unvisited $host $path]
583 RobotWriteRecord $outf $fromurl $distance
588 set inf [RobotFileOpen $task visited $host $path r]
589 RobotReadRecord $inf oldurl olddistance
591 if {[string length $olddistance] == 0} {
594 if {[string length $distance] == 0} {
597 puts "distance=$distance olddistance=$olddistance"
598 if {[expr $distance < $olddistance]} {
599 set outf [RobotFileOpen $task unvisited $host $path]
600 RobotWriteRecord $outf $tourl $distance
605 if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
611 proc link {task url out href body distance} {
613 if {[expr $distance > $control($task,distance)]} return
615 if {![RobotHref $task $url href host path]} return
618 puts $out "<identifier>$href</identifier>"
619 puts $out "<description>$body</description>"
622 if {![RobotFileExist $task visited $host $path]} {
624 if {![RobotFileExist $task bad $host $path]} {
625 if {[RobotFileExist $task unvisited $host $path]} {
626 set inf [RobotFileOpen $task unvisited $host $path r]
627 RobotReadRecord $inf oldurl olddistance
633 if {[string length $olddistance] == 0} {
636 if {[expr $distance < $olddistance]} {
637 set outf [RobotFileOpen $task unvisited $host $path]
638 RobotWriteRecord $outf $url $distance
641 } elseif {[string compare $href $url]} {
642 set inf [RobotFileOpen $task visited $host $path r]
643 RobotReadRecord $inf xurl olddistance
645 if {[string length $olddistance] == 0} {
648 if {[expr $distance < $olddistance]} {
649 puts "OK remarking url=$url href=$href"
650 puts "olddistance = $olddistance"
651 puts "newdistance = $distance"
652 set outf [RobotFileOpen $task unvisited $host $path]
653 RobotWriteRecord $outf $url $distance
659 proc RobotTextHtml {task url out} {
662 # set title so we can emit it for the body
664 # if true, nothing will be indexed
666 # if true, nothing will be followed
671 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
672 set fdistance $URL($task,$url,dist)
673 set distance [expr $fdistance + 1]
675 htmlSwitch $URL($task,$url,buf) \
679 # collect metadata and save NAME= CONTENT=..
682 puts -nonewline $out "<meta"
683 set al [array names parm]
685 set al [string tolower $a]
686 puts -nonewline $out " $al"
687 puts -nonewline $out {="}
688 puts -nonewline $out $parm($a)
689 puts -nonewline $out {"}
692 set metaname [string tolower $parm($a)]
695 set metacontent $parm($a)
701 # go through robots directives (af any)
702 if {![string compare $metaname robots]} {
703 set direcs [split [string tolower $metacontent] ,]
704 if {[lsearch $direcs noindex] >= 0} {
707 if {[lsearch $direcs nofollow] >= 0} {
712 # don't print title of document content if noindex is used
714 puts $out "<title>$title</title>"
715 regsub -all {<!--[^-]*-->} $body { } abody
716 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
717 regsub -all {<[^\>]+>} $bbody {} nbody
718 puts $out "<documentcontent>"
720 puts $out "</documentcontent>"
724 if {![info exists parm(href)]} {
727 set href [string trim $parm(href)]
728 if {![RobotHref $task $url href host path]} continue
729 set URL($task,$url,bpath) $path
731 # <a href="...."> .. </a>
732 # we're not using nonest - otherwise body isn't set
733 if {$nofollow} continue
734 if {![info exists parm(href)]} {
737 link $task $url $out [string trim $parm(href)] $body $distance
739 if {$nofollow} continue
740 if {![info exists parm(href)]} {
743 link $task $url $out [string trim $parm(href)] $body $distance
745 if {![info exists parm(src)]} {
748 link $task $url $out [string trim $parm(src)] $body $fdistance
752 proc RobotsTxt {task url} {
755 RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
758 proc RobotsTxt0 {task v buf} {
761 foreach l [split $buf \n] {
762 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
763 set arg [string trim $arg]
764 puts "cmd=$cmd arg=$arg"
765 switch -- [string tolower $cmd] {
768 set pat [string tolower $arg]*
769 set section [string match $pat $agent]
773 puts "rule [list 0 $arg]"
774 lappend $v [list 0 $arg]
779 puts "rule [list 1 $arg]"
780 lappend $v [list 1 $arg]
788 proc RobotTextPlain {task url out} {
791 puts $out "<documentcontent>"
792 regsub -all {<} $URL($task,$url,buf) {\<} content
794 puts $out "</documentcontent>"
796 if {![string compare $URL($task,$url,path) /robots.txt]} {
801 proc RobotWriteMetadata {task url out} {
807 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
808 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
809 RobotReadRecord $inf fromurl distance
812 set URL($task,$url,dist) $distance
813 puts $out "<distance>"
814 puts $out " $distance"
815 puts $out "</distance>"
816 headSave $task $url $out
817 puts "Parsing $url distance=$distance"
818 switch $URL($task,$url,head,content-type) {
820 if {[string length $distance]} {
821 RobotTextHtml $task $url $out
825 RobotTextPlain $task $url $out
831 proc Robot200 {task url} {
834 set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
835 puts -nonewline $out $URL($task,$url,buf)
838 set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
839 RobotWriteMetadata $task $url $out
842 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
845 proc RobotReadContent {task url sock binary} {
848 set buffer [read $sock 16384]
849 set readCount [string length $buffer]
851 if {$readCount <= 0} {
853 RobotRestart $task $url $sock
854 } elseif {!$binary && [string first \0 $buffer] >= 0} {
856 RobotRestart $task $url $sock
858 # puts "Got $readCount bytes"
859 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
863 proc RobotReadHeader {task url sock} {
864 global URL debuglevel
866 if {$debuglevel > 1} {
867 puts "HTTP head $url"
869 if {[catch {set buffer [read $sock 2148]}]} {
870 RobotError $task $url 404
871 RobotRestart $task $url $sock
874 set readCount [string length $buffer]
876 if {$readCount <= 0} {
877 RobotError $task $url 404
878 RobotRestart $task $url $sock
880 # puts "Got $readCount bytes"
881 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
883 set n [string first \r\n\r\n $URL($task,$url,buf)]
887 set headbuf [string range $URL($task,$url,buf) 0 $n]
889 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
891 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
892 set lines [split $headbuf \n]
893 foreach line $lines {
894 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
895 set URL($task,$url,head,[string tolower $name]) [string trim $value]
898 puts "HTTP CODE $code"
899 set URL($task,$url,state) skip
902 RobotRedirect $task $url $URL($task,$url,head,location) 301
903 RobotRestart $task $url $sock
906 RobotRedirect $task $url $URL($task,$url,head,location) 302
907 RobotRestart $task $url $sock
910 if {![info exists URL($task,$url,head,content-type)]} {
911 set URL($task,$url,head,content-type) {}
914 switch -glob -- $URL($task,$url,head,content-type) {
919 if {![regexp {/robots.txt$} $url]} {
920 if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
921 RobotError $task $url mimedeny
922 RobotRestart $task $url $sock
926 fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
929 RobotError $task $url $code
930 RobotRestart $task $url $sock
937 proc RobotSockCancel {task url sock} {
939 puts "RobotSockCancel sock=$sock url=$url"
940 RobotError $task $url 401
941 RobotRestart $task $url $sock
944 proc RobotConnect {task url sock} {
945 global URL agent acceptLanguage
947 fconfigure $sock -translation {lf crlf} -blocking 0
948 fileevent $sock readable [list RobotReadHeader $task $url $sock]
949 puts $sock "GET $URL($task,$url,path) HTTP/1.0"
950 puts $sock "Host: $URL($task,$url,host)"
951 puts $sock "User-Agent: $agent"
952 if {[string length $acceptLanguage]} {
953 puts $sock "Accept-Language: $acceptLanguage"
956 set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
957 if {[catch {flush $sock}]} {
958 RobotError $task $url 404
959 RobotRestart $task $url $sock
967 proc RobotGetUrl {task url phost} {
968 global URL robotsRunning
970 puts "Retrieve running=$robotsRunning url=$url task=$task"
971 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
974 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
978 set URL($task,$url,method) $method
979 set URL($task,$url,host) $host
980 set URL($task,$url,hostport) $hostport
981 set URL($task,$url,path) $path
982 set URL($task,$url,state) head
983 set URL($task,$url,buf) {}
985 if {[string compare $path /robots.txt]} {
987 if {![info exists URL($hostport,robots)]} {
988 puts "READING robots.txt for host $hostport"
989 if {[RobotFileExist $task visited $hostport /robots.txt]} {
990 set inf [RobotFileOpen $task visited $hostport /robots.txt r]
991 set buf [read $inf 32768]
994 set buf "User-agent: *\nAllow: /\n"
996 RobotsTxt0 $task URL($hostport,robots) $buf
998 if {[info exists URL($hostport,robots)]} {
999 foreach l $URL($hostport,robots) {
1000 if {[string first [lindex $l 1] $path] == 0} {
1001 set ok [lindex $l 0]
1007 puts "skipped due to robots.txt"
1011 if [catch {set sock [socket -async $host $port]}] {
1014 RobotConnect $task $url $sock
1019 if {![llength [info commands htmlSwitch]]} {
1020 set e [info sharedlibextension]
1021 if {[catch {load ./tclrobot$e}]} {
1026 set agent "zmbot/0.2"
1027 if {![catch {set os [exec uname -s -r]}]} {
1028 set agent "$agent ($os)"
1031 puts "agent: $agent"
1042 set acceptLanguage {}
1045 # Rules: allow, deny, url
1047 proc checkrule {task type this} {
1053 if {$debuglevel > 3} {
1054 puts "CHECKRULE $type $this"
1056 if {[info exist control($task,alrules)]} {
1057 foreach l $control($task,alrules) {
1058 if {$debuglevel > 3} {
1062 if {[lindex $l 1] != $type} continue
1063 # consider mask (! negates)
1064 set masks [lindex $l 2]
1067 foreach mask $masks {
1068 if {$debuglevel > 4} {
1069 puts "consider single mask $mask"
1071 if {[string index $mask 0] == "!"} {
1072 set mask [string range $mask 1 end]
1073 if {[string match $mask $this]} continue
1075 if {![string match $mask $this]} continue
1079 if {$debuglevel > 4} {
1083 # OK, we have a match
1084 if {[lindex $l 0] == "allow"} {
1085 if {$debuglevel > 3} {
1086 puts "CHECKRULE MATCH OK"
1090 if {$debuglevel > 3} {
1091 puts "CHECKFULE MATCH FAIL"
1097 if {$debuglevel > 3} {
1098 puts "CHECKRULE MATCH DEFAULT $default_ret"
1105 global debuglevel task
1107 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1108 if {![RobotFileExist $task visited $host $path]} {
1109 set outf [RobotFileOpen $task unvisited $host $path]
1110 RobotWriteRecord $outf href 0
1111 RobotFileClose $outf
1116 proc deny {type stuff} {
1119 lappend control($task,alrules) [list deny $type $stuff]
1122 proc allow {type stuff} {
1125 lappend control($task,alrules) [list allow $type $stuff]
1128 proc debug {level} {
1131 set debuglevel $level
1135 global tasks task status control
1139 if {[info exists tasks]} {
1140 if {[lsearch -exact $tasks $t] >= 0} {
1146 set status($t,unvisited) 0
1147 set status($t,visited) 0
1148 set status($t,bad) 0
1149 set status($t,raw) 0
1150 set status($t,active) 1
1151 set control($t,seq) 0
1152 set control($t,distance) 10
1156 # Little utility that ensures that at least one task is present (main).
1159 if {![info exist tasks]} {
1168 set l [llength $argv]
1171 puts {tclrobot: usage:}
1172 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
1173 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1180 set arg [lindex $argv $i]
1181 switch -glob -- $arg {
1183 set t [string range $arg 2 end]
1184 if {![string length $t]} {
1185 set t [lindex $argv [incr i]]
1190 set dir [string range $arg 2 end]
1191 if {![string length $dir]} {
1192 set dir [lindex $argv [incr i]]
1194 lappend daemon_dir $dir
1197 set robotsMax [string range $arg 2 end]
1198 if {![string length $robotsMax]} {
1199 set robotsMax [lindex $argv [incr i]]
1204 set control($task,distance) [string range $arg 2 end]
1205 if {![string length $control($task,distance)]} {
1206 set control($task,distance) [lindex $argv [incr i]]
1211 set dom [string range $arg 2 end]
1212 if {![string length $dom]} {
1213 set dom [lindex $argv [incr i]]
1215 lappend control($task,domains) $dom
1218 set idletime [string range $arg 2 end]
1219 if {![string length $idletime]} {
1220 set idletime [lindex $argv [incr i]]
1225 set acceptLanguage [string range $arg 2 end]
1226 if {![string length $acceptLanguage]} {
1227 set acceptLanguage [lindex $argv [incr i]]
1232 set rfile [string range $arg 2 end]
1233 if {![string length $rfile]} {
1234 set rfile [lindex $argv [incr i]]
1236 catch {unset maxdistance}
1238 if {[info exists maxdistance]} {
1239 set control($task,distance) $maxdistance
1245 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1246 if {![RobotFileExist $task visited $host $path]} {
1247 set outf [RobotFileOpen $task unvisited $host $path]
1248 RobotWriteRecord $outf href 0
1249 RobotFileClose $outf
1257 if {![info exist robotsMax]} {
1261 if {[info exist daemon_dir]} {
1266 puts "max distance=$control($t,distance)"
1267 if {[info exists control($t,domains)]} {
1268 puts "domains=$control($t,domains)"
1271 puts "max jobs=$robotsMax"
1277 while {$robotsRunning} {
1281 if {[info exists tasks]} {
1283 set statusfile [open $t/status w]
1284 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"