2 # $Id: robot.tcl,v 1.44 2003/06/11 10:11:39 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
14 set end [string length $n]
17 return $lead/[string range $n $off $end]
21 if {[file isdirectory $n]} {
22 set off [string last / $n]
25 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
26 if {[string length $sb]} {
34 proc RobotWriteRecord {outf fromurl distance} {
35 puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
37 puts $outf "<distance>"
39 puts $outf "</distance>"
40 puts $outf "<fromurl>"
42 puts $outf "</fromurl>"
46 proc RobotReadRecord {inf fromurlx distancex} {
47 upvar $fromurlx fromurl
48 upvar $distancex distance
52 set distance [string trim [gets $inf]]
53 # puts "got distance = $distance"
56 set fromurl [string trim [gets $inf]]
59 proc RobotFileNext {task area} {
64 # puts "RobotFileNext seq=$control($task,seq)"
65 if {$control($task,seq) < 0} {
68 if {$control($task,seq) == 0} {
69 if {[catch {set ns($task) [glob $task/$area/*]}]} {
73 # puts "ns=$ns($task)"
74 set off [string length $task/$area]
76 set n [lindex $ns($task) $control($task,seq)]
78 if {![string length $n]} {
79 set control($task,seq) -1
81 set statusfile [open $task/status w]
82 puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
86 incr control($task,seq)
87 if {[file isfile $n/robots.txt_.tkl]} {
88 # puts "ok returning http://[string range $n $off end]/robots.txt"
89 return http://[string range $n $off end]/robots.txt
90 } elseif {[file isdirectory $n]} {
91 set sb [RobotFileNext1 $n http://[string range $n $off end]]
92 if {[string length $sb]} {
96 puts "no more work at end of RobotFileNext n=$n"
102 proc RobotFileExist {task area host path} {
105 if {$debuglevel > 3} {
106 puts "RobotFileExist begin area=$area host=$host path=$path"
108 return [file exists $task/$area/$host${path}_.tkl]
111 proc RobotFileUnlink {task area host path} {
113 # puts "RobotFileUnlink begin"
114 # puts "area=$area host=$host path=$path"
115 set npath $task/$area/$host${path}_.tkl
116 # puts "npath=$npath"
117 set comp [split $npath /]
118 if {[catch {exec rm $npath}]} return
120 set l [llength $comp]
122 incr status($task,$area) -1
123 for {set i $l} {$i > 0} {incr i -1} {
124 set path [join [lrange $comp 0 $i] /]
125 if {![catch {glob $path/*}]} return
128 # puts "RobotFileUnlink end"
131 proc RobotFileClose {out} {
132 if [string compare $out stdout] {
137 proc RobotFileOpen {task area host path {mode w}} {
143 # puts "RobotFileOpen task=$task path=$path"
145 if {![info exists workdir]} {
148 if {$debuglevel > 3} {
149 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
151 if {[string compare $orgPwd $workdir]} {
152 puts "ooops. RobotFileOpen failed"
153 puts "workdir = $workdir"
158 set comp [split $task/$area/$host /]
159 set len [llength $comp]
162 # puts "1 comp=$comp"
164 for {set i 0} {$i <= $len} {incr i} {
165 set d [lindex $comp $i]
166 if {[string length $d] == 0} {
168 } elseif {[catch {cd $d}]} {
171 if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
172 if {[string compare $path /robots.txt]} {
173 set out [open robots.txt_.tkl w]
174 puts "creating robots.txt in $d"
176 incr status($task,unvisited)
182 set comp [split $path /]
183 set len [llength $comp]
186 # puts "2 path=$path comp=$comp"
188 for {set i 0} {$i < $len} {incr i} {
189 set d [lindex $comp $i]
190 if {[string length $d] > 0} {
191 if {[catch {cd $d}]} {
197 set d [lindex $comp $len]
198 set out [open ${d}_.tkl $mode]
200 incr status($task,$area)
206 proc RobotStartJob {fname t} {
209 set f [open $fname r]
211 puts "Reading $fname"
213 if {![regexp {<status>([^<]*)</status>} $xml x status]} {
216 if {$status == "done"} {
220 puts "status = $status"
229 set action $parm(action)
230 if {$type == "domain"} {
231 $action url http://$body/*
233 if {$type == "url"} {
236 if {$type == "mime"} {
240 set control($t,distance) $body
242 set control($t,filestatus) $body
244 if {$status == "pending"} {
245 regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
246 set f [open $fname w]
247 puts -nonewline $f $xml2
252 proc RobotDoneJob {t} {
255 if {![info exists daemon_dir]} {
261 set f [open $fname r]
263 puts "Reading $fname"
264 regexp {<status>([^<]*)</status>} $xml x status
266 puts "status = $status"
269 regsub {<status>[^<]*</status>} $xml {<status>done</status>} xml2
270 set f [open $fname w]
271 puts -nonewline $f $xml2
275 proc RobotScanDir {} {
278 if {![info exists daemon_dir]} {
281 foreach d $daemon_dir {
282 if {[catch {set files [glob $d/*.tkl]}]} {
285 foreach fname $files {
286 if {[file isfile $fname] && [file readable $fname]} {
287 set t [file rootname $fname]
288 RobotStartJob $fname $t
294 proc RobotRR {task} {
295 global control robotsRunning tasks robotsMax status
297 puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
298 incr robotsRunning -1
300 # only one task gets through...
301 if {[string compare [lindex $tasks 0] $task]} {
304 puts "RobotRR. task = $task"
305 while {$robotsRunning} {
309 if {[catch {RobotScanDir} msg]} {
310 puts "RobotScanDir failed"
314 set statusfile [open $t/status w]
315 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
317 set control($t,seq) 0
322 proc RobotDaemonSig {} {
328 proc RobotDaemonLoop {} {
329 global daemon_cnt tasks robotsRunning status
337 if {[info exists tasks]} {
338 puts "daemon loop tasks $tasks"
340 set control($t,seq) 0
343 while {$robotsRunning} {
347 after 30000 RobotDaemonSig
352 proc RobotRestart {task url sock} {
353 global URL robotsRunning
356 after cancel $URL($sock,cancel)
358 foreach v [array names URL $task,$url,*] {
362 incr robotsRunning -1
366 proc RobotStart {task} {
368 global robotsRunning robotsMax idletime status tasks
370 # puts "RobotStart $task running=$robotsRunning"
372 set url [RobotFileNext $task unvisited]
373 if {[string compare $url done] == 0} {
374 puts "In RobotStart task $task done"
378 if {[string compare $t $task]} {
384 if {![info exists ntasks]} {
393 if {![string length $url]} {
397 if {[string compare $url wait] == 0} {
398 after $idletime [list RobotRR $task]
401 set r [RobotGetUrl $task $url {}]
403 if {$robotsRunning >= $robotsMax} return
405 incr robotsRunning -1
406 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
407 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
410 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
415 proc headSave {task url out} {
418 if {[info exists URL($task,$url,head,last-modified)]} {
419 puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
422 if {[info exists URL($task,$url,head,date)]} {
423 puts $out " <date>$URL($task,$url,head,date)</date>"
425 if {[info exists URL($task,$url,head,content-length)]} {
426 puts $out " <by>$URL($task,$url,head,content-length)</by>"
428 if {[info exists URL($task,$url,head,server)]} {
429 puts $out " <format>$URL($task,$url,head,server)</format>"
432 puts $out {<publisher>}
433 puts $out " <identifier>$url</identifier>"
434 if {[info exists URL($task,$url,head,content-type)]} {
435 puts $out " <type>$URL($task,$url,head,content-type)</type>"
437 puts $out {</publisher>}
440 proc RobotHref {task url hrefx hostx pathx} {
441 global URL control debuglevel
446 if {$debuglevel > 1} {
447 puts "Ref input url = $url href=$href"
450 if {[string first { } $href] >= 0} {
453 if {[string length $href] > 256} {
457 # Skip pages that have ? in them
458 # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
461 # get method (if any)
462 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
466 if {[string compare $method http]} {
471 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
472 if {![string length $surl]} {
475 if {[info exist control($task,domains)]} {
477 foreach domain $control($task,domains) {
478 if {[string match $domain $host]} {
488 regexp {^([^\#]*)} $hpath x surl
489 set host $URL($task,$url,hostport)
491 if {![string length $surl]} {
494 if {[string first / $surl]} {
496 set curpath $URL($task,$url,path)
497 if {[info exists URL($task,$url,bpath)]} {
498 set curpath $URL($task,$url,bpath)
500 regexp {^([^\#?]*)} $curpath x dpart
501 set l [string last / $dpart]
502 if {[expr $l >= 0]} {
503 set surl [string range $dpart 0 $l]$surl
505 set surl $dpart/$surl
508 set surllist [split $surl /]
511 foreach c $surllist {
516 set path [lrange $path 0 $pathl]
529 if {$debuglevel > 4} {
530 puts "pathl=$pathl output path=$path"
532 set path [join $path /]
533 if {![string length $path]} {
536 regsub -all {~} $path {%7E} path
537 set href "$method://$host$path"
539 if {$debuglevel > 1} {
540 puts "Ref result = $href"
542 return [checkrule $task url $href]
545 proc RobotError {task url code} {
548 puts "Bad URL $url (code $code)"
551 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
552 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
553 RobotReadRecord $inf fromurl distance
556 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
557 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
558 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
559 RobotWriteRecord $outf $fromurl $distance
564 proc RobotRedirect {task url tourl code} {
567 puts "Redirecting from $url to $tourl"
571 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
572 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
573 RobotReadRecord $inf fromurl distance
576 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
577 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
578 RobotWriteRecord $outf $fromurl $distance
581 if {[RobotHref $task $url tourl host path]} {
582 if {![RobotFileExist $task visited $host $path]} {
583 if {![RobotFileExist $task unvisited $host $path]} {
584 set outf [RobotFileOpen $task unvisited $host $path]
585 RobotWriteRecord $outf $fromurl $distance
590 set inf [RobotFileOpen $task visited $host $path r]
591 RobotReadRecord $inf oldurl olddistance
593 if {[string length $olddistance] == 0} {
596 if {[string length $distance] == 0} {
599 puts "distance=$distance olddistance=$olddistance"
600 if {[expr $distance < $olddistance]} {
601 set outf [RobotFileOpen $task unvisited $host $path]
602 RobotWriteRecord $outf $tourl $distance
607 if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
613 proc link {task url out href body distance} {
615 if {[expr $distance > $control($task,distance)]} return
617 if {![RobotHref $task $url href host path]} return
620 puts $out "<identifier>$href</identifier>"
621 puts $out "<description>$body</description>"
624 if {![RobotFileExist $task visited $host $path]} {
626 if {![RobotFileExist $task bad $host $path]} {
627 if {[RobotFileExist $task unvisited $host $path]} {
628 set inf [RobotFileOpen $task unvisited $host $path r]
629 RobotReadRecord $inf oldurl olddistance
635 if {[string length $olddistance] == 0} {
638 if {[expr $distance < $olddistance]} {
639 set outf [RobotFileOpen $task unvisited $host $path]
640 RobotWriteRecord $outf $url $distance
643 } elseif {[string compare $href $url]} {
644 set inf [RobotFileOpen $task visited $host $path r]
645 RobotReadRecord $inf xurl olddistance
647 if {[string length $olddistance] == 0} {
650 if {[expr $distance < $olddistance]} {
651 puts "OK remarking url=$url href=$href"
652 puts "olddistance = $olddistance"
653 puts "newdistance = $distance"
654 set outf [RobotFileOpen $task unvisited $host $path]
655 RobotWriteRecord $outf $url $distance
661 proc RobotTextHtml {task url out} {
664 # set title so we can emit it for the body
666 # if true, nothing will be indexed
668 # if true, nothing will be followed
673 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
674 set fdistance $URL($task,$url,dist)
675 set distance [expr $fdistance + 1]
677 htmlSwitch $URL($task,$url,buf) \
681 # collect metadata and save NAME= CONTENT=..
684 puts -nonewline $out "<meta"
685 set al [array names parm]
687 set al [string tolower $a]
688 puts -nonewline $out " $al"
689 puts -nonewline $out {="}
690 puts -nonewline $out $parm($a)
691 puts -nonewline $out {"}
694 set metaname [string tolower $parm($a)]
697 set metacontent $parm($a)
703 # go through robots directives (af any)
704 if {![string compare $metaname robots]} {
705 set direcs [split [string tolower $metacontent] ,]
706 if {[lsearch $direcs noindex] >= 0} {
709 if {[lsearch $direcs nofollow] >= 0} {
714 # don't print title of document content if noindex is used
716 puts $out "<title>$title</title>"
717 regsub -all {<!--[^-]*-->} $body { } abody
718 regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
719 regsub -all {<[^\>]+>} $bbody {} nbody
720 puts $out "<documentcontent>"
722 puts $out "</documentcontent>"
726 if {![info exists parm(href)]} {
729 set href [string trim $parm(href)]
730 if {![RobotHref $task $url href host path]} continue
731 set URL($task,$url,bpath) $path
733 # <a href="...."> .. </a>
734 # we're not using nonest - otherwise body isn't set
735 if {$nofollow} continue
736 if {![info exists parm(href)]} {
739 link $task $url $out [string trim $parm(href)] $body $distance
741 if {$nofollow} continue
742 if {![info exists parm(href)]} {
745 link $task $url $out [string trim $parm(href)] $body $distance
747 if {![info exists parm(src)]} {
750 link $task $url $out [string trim $parm(src)] $body $fdistance
754 proc RobotsTxt {task url} {
757 RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
760 proc RobotsTxt0 {task v buf} {
763 foreach l [split $buf \n] {
764 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
765 set arg [string trim $arg]
766 puts "cmd=$cmd arg=$arg"
767 switch -- [string tolower $cmd] {
770 set pat [string tolower $arg]*
771 set section [string match $pat $agent]
775 puts "rule [list 0 $arg]"
776 lappend $v [list 0 $arg]
781 puts "rule [list 1 $arg]"
782 lappend $v [list 1 $arg]
790 proc RobotTextPlain {task url out} {
793 puts $out "<documentcontent>"
794 regsub -all {<} $URL($task,$url,buf) {\<} content
796 puts $out "</documentcontent>"
798 if {![string compare $URL($task,$url,path) /robots.txt]} {
803 proc RobotWriteMetadata {task url out} {
806 set charset $URL($task,$url,charset)
807 puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
811 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
812 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
813 RobotReadRecord $inf fromurl distance
816 set URL($task,$url,dist) $distance
817 puts $out "<distance>"
818 puts $out " $distance"
819 puts $out "</distance>"
820 headSave $task $url $out
821 puts "Parsing $url distance=$distance"
822 switch $URL($task,$url,head,content-type) {
824 if {[string length $distance]} {
825 RobotTextHtml $task $url $out
829 RobotTextPlain $task $url $out
835 proc Robot200 {task url} {
838 set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
839 puts -nonewline $out $URL($task,$url,buf)
842 set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
843 RobotWriteMetadata $task $url $out
846 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
849 proc RobotReadContent {task url sock binary} {
852 set buffer [read $sock 16384]
853 set readCount [string length $buffer]
855 if {$readCount <= 0} {
857 RobotRestart $task $url $sock
858 } elseif {!$binary && [string first \0 $buffer] >= 0} {
860 RobotRestart $task $url $sock
862 # puts "Got $readCount bytes"
863 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
867 proc RobotReadHeader {task url sock} {
868 global URL debuglevel
870 if {$debuglevel > 1} {
871 puts "HTTP head $url"
873 if {[catch {set buffer [read $sock 2148]}]} {
874 RobotError $task $url 404
875 RobotRestart $task $url $sock
878 set readCount [string length $buffer]
880 if {$readCount <= 0} {
881 RobotError $task $url 404
882 RobotRestart $task $url $sock
884 # puts "Got $readCount bytes"
885 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
887 set n [string first \r\n\r\n $URL($task,$url,buf)]
891 set headbuf [string range $URL($task,$url,buf) 0 $n]
893 set URL($task,$url,charset) ISO-8859-1
894 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
896 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
897 set lines [split $headbuf \n]
898 foreach line $lines {
899 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
900 set URL($task,$url,head,[string tolower $name]) [string trim $value]
902 regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
904 puts "HTTP CODE $code"
905 set URL($task,$url,state) skip
908 RobotRedirect $task $url $URL($task,$url,head,location) 301
909 RobotRestart $task $url $sock
912 RobotRedirect $task $url $URL($task,$url,head,location) 302
913 RobotRestart $task $url $sock
916 if {![info exists URL($task,$url,head,content-type)]} {
917 set URL($task,$url,head,content-type) {}
920 switch -glob -- $URL($task,$url,head,content-type) {
925 if {![regexp {/robots.txt$} $url]} {
926 if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
927 RobotError $task $url mimedeny
928 RobotRestart $task $url $sock
932 fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
935 RobotError $task $url $code
936 RobotRestart $task $url $sock
943 proc RobotSockCancel {task url sock} {
945 puts "RobotSockCancel sock=$sock url=$url"
946 RobotError $task $url 401
947 RobotRestart $task $url $sock
950 proc RobotConnect {task url sock} {
951 global URL agent acceptLanguage
953 fconfigure $sock -translation {lf crlf} -blocking 0
954 fileevent $sock readable [list RobotReadHeader $task $url $sock]
955 puts $sock "GET $URL($task,$url,path) HTTP/1.0"
956 puts $sock "Host: $URL($task,$url,host)"
957 puts $sock "User-Agent: $agent"
958 if {[string length $acceptLanguage]} {
959 puts $sock "Accept-Language: $acceptLanguage"
962 set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
963 if {[catch {flush $sock}]} {
964 RobotError $task $url 404
965 RobotRestart $task $url $sock
973 proc RobotGetUrl {task url phost} {
974 global URL robotsRunning
976 puts "Retrieve running=$robotsRunning url=$url task=$task"
977 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
980 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
984 set URL($task,$url,method) $method
985 set URL($task,$url,host) $host
986 set URL($task,$url,hostport) $hostport
987 set URL($task,$url,path) $path
988 set URL($task,$url,state) head
989 set URL($task,$url,buf) {}
991 if {[string compare $path /robots.txt]} {
993 if {![info exists URL($hostport,robots)]} {
994 puts "READING robots.txt for host $hostport"
995 if {[RobotFileExist $task visited $hostport /robots.txt]} {
996 set inf [RobotFileOpen $task visited $hostport /robots.txt r]
997 set buf [read $inf 32768]
1000 set buf "User-agent: *\nAllow: /\n"
1002 RobotsTxt0 $task URL($hostport,robots) $buf
1004 if {[info exists URL($hostport,robots)]} {
1005 foreach l $URL($hostport,robots) {
1006 if {[string first [lindex $l 1] $path] == 0} {
1007 set ok [lindex $l 0]
1013 puts "skipped due to robots.txt"
1017 if [catch {set sock [socket -async $host $port]}] {
1020 RobotConnect $task $url $sock
1025 if {![llength [info commands htmlSwitch]]} {
1026 set e [info sharedlibextension]
1027 if {[catch {load ./tclrobot$e}]} {
1032 set agent "zmbot/0.2"
1033 if {![catch {set os [exec uname -s -r]}]} {
1034 set agent "$agent ($os)"
1037 puts "agent: $agent"
1048 set acceptLanguage {}
1051 # Rules: allow, deny, url
1053 proc checkrule {task type this} {
1059 if {$debuglevel > 3} {
1060 puts "CHECKRULE $type $this"
1062 if {[info exist control($task,alrules)]} {
1063 foreach l $control($task,alrules) {
1064 if {$debuglevel > 3} {
1068 if {[lindex $l 1] != $type} continue
1069 # consider mask (! negates)
1070 set masks [lindex $l 2]
1073 foreach mask $masks {
1074 if {$debuglevel > 4} {
1075 puts "consider single mask $mask"
1077 if {[string index $mask 0] == "!"} {
1078 set mask [string range $mask 1 end]
1079 if {[string match $mask $this]} continue
1081 if {![string match $mask $this]} continue
1085 if {$debuglevel > 4} {
1089 # OK, we have a match
1090 if {[lindex $l 0] == "allow"} {
1091 if {$debuglevel > 3} {
1092 puts "CHECKRULE MATCH OK"
1096 if {$debuglevel > 3} {
1097 puts "CHECKFULE MATCH FAIL"
1103 if {$debuglevel > 3} {
1104 puts "CHECKRULE MATCH DEFAULT $default_ret"
1111 global debuglevel task
1113 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1114 if {![RobotFileExist $task visited $host $path]} {
1115 set outf [RobotFileOpen $task unvisited $host $path]
1116 RobotWriteRecord $outf href 0
1117 RobotFileClose $outf
1122 proc deny {type stuff} {
1125 lappend control($task,alrules) [list deny $type $stuff]
1128 proc allow {type stuff} {
1131 lappend control($task,alrules) [list allow $type $stuff]
1134 proc debug {level} {
1137 set debuglevel $level
1141 global tasks task status control
1145 if {[info exists tasks]} {
1146 if {[lsearch -exact $tasks $t] >= 0} {
1152 set status($t,unvisited) 0
1153 set status($t,visited) 0
1154 set status($t,bad) 0
1155 set status($t,raw) 0
1156 set status($t,active) 1
1157 set control($t,seq) 0
1158 set control($t,distance) 10
1162 # Little utility that ensures that at least one task is present (main).
1165 if {![info exist tasks]} {
1174 set l [llength $argv]
1177 puts {tclrobot: usage:}
1178 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
1179 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1186 set arg [lindex $argv $i]
1187 switch -glob -- $arg {
1189 set t [string range $arg 2 end]
1190 if {![string length $t]} {
1191 set t [lindex $argv [incr i]]
1196 set dir [string range $arg 2 end]
1197 if {![string length $dir]} {
1198 set dir [lindex $argv [incr i]]
1200 lappend daemon_dir $dir
1203 set robotsMax [string range $arg 2 end]
1204 if {![string length $robotsMax]} {
1205 set robotsMax [lindex $argv [incr i]]
1210 set control($task,distance) [string range $arg 2 end]
1211 if {![string length $control($task,distance)]} {
1212 set control($task,distance) [lindex $argv [incr i]]
1217 set dom [string range $arg 2 end]
1218 if {![string length $dom]} {
1219 set dom [lindex $argv [incr i]]
1221 lappend control($task,domains) $dom
1224 set idletime [string range $arg 2 end]
1225 if {![string length $idletime]} {
1226 set idletime [lindex $argv [incr i]]
1231 set acceptLanguage [string range $arg 2 end]
1232 if {![string length $acceptLanguage]} {
1233 set acceptLanguage [lindex $argv [incr i]]
1238 set rfile [string range $arg 2 end]
1239 if {![string length $rfile]} {
1240 set rfile [lindex $argv [incr i]]
1242 catch {unset maxdistance}
1244 if {[info exists maxdistance]} {
1245 set control($task,distance) $maxdistance
1251 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1252 if {![RobotFileExist $task visited $host $path]} {
1253 set outf [RobotFileOpen $task unvisited $host $path]
1254 RobotWriteRecord $outf href 0
1255 RobotFileClose $outf
1263 if {![info exist robotsMax]} {
1267 if {[info exist daemon_dir]} {
1272 puts "max distance=$control($t,distance)"
1273 if {[info exists control($t,domains)]} {
1274 puts "domains=$control($t,domains)"
1277 puts "max jobs=$robotsMax"
1283 while {$robotsRunning} {
1287 if {[info exists tasks]} {
1289 set statusfile [open $t/status w]
1290 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"