2 # $Id: robot.tcl,v 1.45 2003/06/11 10:29:41 adam Exp $
4 proc RobotFileNext1 {area lead} {
5 # puts "RobotFileNext1 area=$area lead=$lead"
6 if {[catch {set ns [glob ${area}/*]}]} {
10 if {[file isfile $n]} {
11 set off [string last / $n]
14 set end [string length $n]
17 return $lead/[string range $n $off $end]
21 if {[file isdirectory $n]} {
22 set off [string last / $n]
25 set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
26 if {[string length $sb]} {
34 proc RobotWriteRecord {outf fromurl distance} {
35 puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
37 puts $outf "<distance>"
39 puts $outf "</distance>"
40 puts $outf "<fromurl>"
42 puts $outf "</fromurl>"
46 proc RobotReadRecord {inf fromurlx distancex} {
47 upvar $fromurlx fromurl
48 upvar $distancex distance
52 set distance [string trim [gets $inf]]
53 # puts "got distance = $distance"
56 set fromurl [string trim [gets $inf]]
59 proc RobotFileNext {task area} {
64 # puts "RobotFileNext seq=$control($task,seq)"
65 if {$control($task,seq) < 0} {
68 if {$control($task,seq) == 0} {
69 if {[catch {set ns($task) [glob $task/$area/*]}]} {
73 # puts "ns=$ns($task)"
74 set off [string length $task/$area]
76 set n [lindex $ns($task) $control($task,seq)]
78 if {![string length $n]} {
79 set control($task,seq) -1
81 set statusfile [open $task/status w]
82 puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
86 incr control($task,seq)
87 if {[file isfile $n/robots.txt_.tkl]} {
88 # puts "ok returning http://[string range $n $off end]/robots.txt"
89 return http://[string range $n $off end]/robots.txt
90 } elseif {[file isdirectory $n]} {
91 set sb [RobotFileNext1 $n http://[string range $n $off end]]
92 if {[string length $sb]} {
96 puts "no more work at end of RobotFileNext n=$n"
102 proc RobotFileExist {task area host path} {
105 if {$debuglevel > 3} {
106 puts "RobotFileExist begin area=$area host=$host path=$path"
108 return [file exists $task/$area/$host${path}_.tkl]
111 proc RobotFileUnlink {task area host path} {
113 # puts "RobotFileUnlink begin"
114 # puts "area=$area host=$host path=$path"
115 set npath $task/$area/$host${path}_.tkl
116 # puts "npath=$npath"
117 set comp [split $npath /]
118 if {[catch {exec rm $npath}]} return
120 set l [llength $comp]
122 incr status($task,$area) -1
123 for {set i $l} {$i > 0} {incr i -1} {
124 set path [join [lrange $comp 0 $i] /]
125 if {![catch {glob $path/*}]} return
128 # puts "RobotFileUnlink end"
131 proc RobotFileClose {out} {
132 if [string compare $out stdout] {
137 proc RobotFileOpen {task area host path {mode w}} {
143 # puts "RobotFileOpen task=$task path=$path"
145 if {![info exists workdir]} {
148 if {$debuglevel > 3} {
149 puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
151 if {[string compare $orgPwd $workdir]} {
152 puts "ooops. RobotFileOpen failed"
153 puts "workdir = $workdir"
158 set comp [split $task/$area/$host /]
159 set len [llength $comp]
162 # puts "1 comp=$comp"
164 for {set i 0} {$i <= $len} {incr i} {
165 set d [lindex $comp $i]
166 if {[string length $d] == 0} {
168 } elseif {[catch {cd $d}]} {
171 if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
172 if {[string compare $path /robots.txt]} {
173 set out [open robots.txt_.tkl w]
174 puts "creating robots.txt in $d"
176 incr status($task,unvisited)
182 set comp [split $path /]
183 set len [llength $comp]
186 # puts "2 path=$path comp=$comp"
188 for {set i 0} {$i < $len} {incr i} {
189 set d [lindex $comp $i]
190 if {[string length $d] > 0} {
191 if {[catch {cd $d}]} {
197 set d [lindex $comp $len]
198 set out [open ${d}_.tkl $mode]
200 incr status($task,$area)
206 proc RobotStartJob {fname t} {
209 set f [open $fname r]
211 puts "Reading $fname"
213 if {![regexp {<status>([^<]*)</status>} $xml x status]} {
216 if {$status == "done"} {
220 puts "status = $status"
229 set action $parm(action)
230 if {$type == "domain"} {
231 $action url http://$body/*
233 if {$type == "url"} {
236 if {$type == "mime"} {
240 set control($t,distance) $body
242 set control($t,filestatus) $body
244 if {$status == "pending"} {
245 regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
246 set f [open $fname w]
247 puts -nonewline $f $xml2
252 proc RobotDoneJob {t} {
255 if {![info exists daemon_dir]} {
261 set f [open $fname r]
263 puts "Reading $fname"
264 regexp {<status>([^<]*)</status>} $xml x status
266 puts "status = $status"
269 regsub {<status>[^<]*</status>} $xml {<status>done</status>} xml2
270 set f [open $fname w]
271 puts -nonewline $f $xml2
275 proc RobotScanDir {} {
278 if {![info exists daemon_dir]} {
281 foreach d $daemon_dir {
282 if {[catch {set files [glob $d/*.tkl]}]} {
285 foreach fname $files {
286 if {[file isfile $fname] && [file readable $fname]} {
287 set t [file rootname $fname]
288 RobotStartJob $fname $t
294 proc RobotRR {task} {
295 global control robotsRunning tasks robotsMax status
297 puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
298 incr robotsRunning -1
300 # only one task gets through...
301 if {[string compare [lindex $tasks 0] $task]} {
304 puts "RobotRR. task = $task"
305 while {$robotsRunning} {
309 if {[catch {RobotScanDir} msg]} {
310 puts "RobotScanDir failed"
314 set statusfile [open $t/status w]
315 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
317 set control($t,seq) 0
322 proc RobotDaemonSig {} {
328 proc RobotDaemonLoop {} {
329 global daemon_cnt tasks robotsRunning status
337 if {[info exists tasks]} {
338 puts "daemon loop tasks $tasks"
340 set control($t,seq) 0
343 while {$robotsRunning} {
347 after 30000 RobotDaemonSig
352 proc RobotRestart {task url sock} {
353 global URL robotsRunning
356 after cancel $URL($sock,cancel)
358 foreach v [array names URL $task,$url,*] {
362 incr robotsRunning -1
366 proc RobotStart {task} {
368 global robotsRunning robotsMax idletime status tasks
370 # puts "RobotStart $task running=$robotsRunning"
372 set url [RobotFileNext $task unvisited]
373 if {[string compare $url done] == 0} {
374 puts "In RobotStart task $task done"
378 if {[string compare $t $task]} {
384 if {![info exists ntasks]} {
393 if {![string length $url]} {
397 if {[string compare $url wait] == 0} {
398 after $idletime [list RobotRR $task]
401 set r [RobotGetUrl $task $url {}]
403 if {$robotsRunning >= $robotsMax} return
405 incr robotsRunning -1
406 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
407 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
410 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
415 proc headSave {task url out} {
418 if {[info exists URL($task,$url,head,last-modified)]} {
419 puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
422 if {[info exists URL($task,$url,head,date)]} {
423 puts $out " <date>$URL($task,$url,head,date)</date>"
425 if {[info exists URL($task,$url,head,content-length)]} {
426 puts $out " <by>$URL($task,$url,head,content-length)</by>"
428 if {[info exists URL($task,$url,head,server)]} {
429 puts $out " <format>$URL($task,$url,head,server)</format>"
432 puts $out {<publisher>}
433 puts $out " <identifier>$url</identifier>"
434 if {[info exists URL($task,$url,head,content-type)]} {
435 puts $out " <type>$URL($task,$url,head,content-type)</type>"
437 puts $out {</publisher>}
440 proc RobotHref {task url hrefx hostx pathx} {
441 global URL control debuglevel
446 if {$debuglevel > 1} {
447 puts "Ref input url = $url href=$href"
450 if {[string first { } $href] >= 0} {
453 if {[string length $href] > 256} {
457 # Skip pages that have ? in them
458 # if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
461 # get method (if any)
462 if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
466 if {[string compare $method http]} {
471 if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
472 if {![string length $surl]} {
475 if {[info exist control($task,domains)]} {
477 foreach domain $control($task,domains) {
478 if {[string match $domain $host]} {
488 regexp {^([^\#]*)} $hpath x surl
489 set host $URL($task,$url,hostport)
491 if {![string length $surl]} {
494 if {[string first / $surl]} {
496 set curpath $URL($task,$url,path)
497 if {[info exists URL($task,$url,bpath)]} {
498 set curpath $URL($task,$url,bpath)
500 regexp {^([^\#?]*)} $curpath x dpart
501 set l [string last / $dpart]
502 if {[expr $l >= 0]} {
503 set surl [string range $dpart 0 $l]$surl
505 set surl $dpart/$surl
508 set surllist [split $surl /]
511 foreach c $surllist {
516 set path [lrange $path 0 $pathl]
529 if {$debuglevel > 4} {
530 puts "pathl=$pathl output path=$path"
532 set path [join $path /]
533 if {![string length $path]} {
536 regsub -all {~} $path {%7E} path
537 set href "$method://$host$path"
539 if {$debuglevel > 1} {
540 puts "Ref result = $href"
542 return [checkrule $task url $href]
545 proc RobotError {task url code} {
548 puts "Bad URL $url (code $code)"
551 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
552 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
553 RobotReadRecord $inf fromurl distance
556 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
557 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
558 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
559 RobotWriteRecord $outf $fromurl $distance
564 proc RobotRedirect {task url tourl code} {
567 puts "Redirecting from $url to $tourl"
571 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
572 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
573 RobotReadRecord $inf fromurl distance
576 if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
577 set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
578 RobotWriteRecord $outf $fromurl $distance
581 if {[RobotHref $task $url tourl host path]} {
582 if {![RobotFileExist $task visited $host $path]} {
583 if {![RobotFileExist $task unvisited $host $path]} {
584 set outf [RobotFileOpen $task unvisited $host $path]
585 RobotWriteRecord $outf $fromurl $distance
590 set inf [RobotFileOpen $task visited $host $path r]
591 RobotReadRecord $inf oldurl olddistance
593 if {[string length $olddistance] == 0} {
596 if {[string length $distance] == 0} {
599 puts "distance=$distance olddistance=$olddistance"
600 if {[expr $distance < $olddistance]} {
601 set outf [RobotFileOpen $task unvisited $host $path]
602 RobotWriteRecord $outf $tourl $distance
607 if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
613 proc wellform {body} {
614 regsub -all {<!--[^-]*-->} $body { } abody
615 regsub -all -nocase {<script[^<]*</script>} $abody {} body
616 regsub -all {<[^\>]+>} $body {} abody
617 regsub -all { } $abody { } body
618 regsub -all {&} $body {&} abody
622 proc link {task url out href body distance} {
624 if {[expr $distance > $control($task,distance)]} return
626 if {![RobotHref $task $url href host path]} return
629 puts $out "<identifier>$href</identifier>"
630 set abody [wellform $body]
631 puts $out "<description>$abody</description>"
634 if {![RobotFileExist $task visited $host $path]} {
636 if {![RobotFileExist $task bad $host $path]} {
637 if {[RobotFileExist $task unvisited $host $path]} {
638 set inf [RobotFileOpen $task unvisited $host $path r]
639 RobotReadRecord $inf oldurl olddistance
645 if {[string length $olddistance] == 0} {
648 if {[expr $distance < $olddistance]} {
649 set outf [RobotFileOpen $task unvisited $host $path]
650 RobotWriteRecord $outf $url $distance
653 } elseif {[string compare $href $url]} {
654 set inf [RobotFileOpen $task visited $host $path r]
655 RobotReadRecord $inf xurl olddistance
657 if {[string length $olddistance] == 0} {
660 if {[expr $distance < $olddistance]} {
661 puts "OK remarking url=$url href=$href"
662 puts "olddistance = $olddistance"
663 puts "newdistance = $distance"
664 set outf [RobotFileOpen $task unvisited $host $path]
665 RobotWriteRecord $outf $url $distance
671 proc RobotTextHtml {task url out} {
674 # set title so we can emit it for the body
676 # if true, nothing will be indexed
678 # if true, nothing will be followed
683 if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
684 set fdistance $URL($task,$url,dist)
685 set distance [expr $fdistance + 1]
687 htmlSwitch $URL($task,$url,buf) \
691 # collect metadata and save NAME= CONTENT=..
694 puts -nonewline $out "<meta"
695 set al [array names parm]
697 set al [string tolower $a]
698 puts -nonewline $out " $al"
699 puts -nonewline $out {="}
700 puts -nonewline $out $parm($a)
701 puts -nonewline $out {"}
704 set metaname [string tolower $parm($a)]
707 set metacontent $parm($a)
713 # go through robots directives (af any)
714 if {![string compare $metaname robots]} {
715 set direcs [split [string tolower $metacontent] ,]
716 if {[lsearch $direcs noindex] >= 0} {
719 if {[lsearch $direcs nofollow] >= 0} {
724 # don't print title of document content if noindex is used
726 puts $out "<title>$title</title>"
727 set bbody [wellform $body]
728 puts $out "<documentcontent>"
730 puts $out "</documentcontent>"
734 if {![info exists parm(href)]} {
737 set href [string trim $parm(href)]
738 if {![RobotHref $task $url href host path]} continue
739 set URL($task,$url,bpath) $path
741 # <a href="...."> .. </a>
742 # we're not using nonest - otherwise body isn't set
743 if {$nofollow} continue
744 if {![info exists parm(href)]} {
747 link $task $url $out [string trim $parm(href)] $body $distance
749 if {$nofollow} continue
750 if {![info exists parm(href)]} {
753 link $task $url $out [string trim $parm(href)] $body $distance
755 if {![info exists parm(src)]} {
758 link $task $url $out [string trim $parm(src)] $body $fdistance
762 proc RobotsTxt {task url} {
765 RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
768 proc RobotsTxt0 {task v buf} {
771 foreach l [split $buf \n] {
772 if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
773 set arg [string trim $arg]
774 puts "cmd=$cmd arg=$arg"
775 switch -- [string tolower $cmd] {
778 set pat [string tolower $arg]*
779 set section [string match $pat $agent]
783 puts "rule [list 0 $arg]"
784 lappend $v [list 0 $arg]
789 puts "rule [list 1 $arg]"
790 lappend $v [list 1 $arg]
798 proc RobotTextPlain {task url out} {
801 puts $out "<documentcontent>"
802 regsub -all {<} $URL($task,$url,buf) {\<} content
804 puts $out "</documentcontent>"
806 if {![string compare $URL($task,$url,path) /robots.txt]} {
811 proc RobotWriteMetadata {task url out} {
814 set charset $URL($task,$url,charset)
815 puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
819 if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
820 set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
821 RobotReadRecord $inf fromurl distance
824 set URL($task,$url,dist) $distance
825 puts $out "<distance>"
826 puts $out " $distance"
827 puts $out "</distance>"
828 headSave $task $url $out
829 puts "Parsing $url distance=$distance"
830 switch $URL($task,$url,head,content-type) {
832 if {[string length $distance]} {
833 RobotTextHtml $task $url $out
837 RobotTextPlain $task $url $out
843 proc Robot200 {task url} {
846 set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
847 puts -nonewline $out $URL($task,$url,buf)
850 set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
851 RobotWriteMetadata $task $url $out
854 RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
857 proc RobotReadContent {task url sock binary} {
860 set buffer [read $sock 16384]
861 set readCount [string length $buffer]
863 if {$readCount <= 0} {
865 RobotRestart $task $url $sock
866 } elseif {!$binary && [string first \0 $buffer] >= 0} {
868 RobotRestart $task $url $sock
870 # puts "Got $readCount bytes"
871 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
875 proc RobotReadHeader {task url sock} {
876 global URL debuglevel
878 if {$debuglevel > 1} {
879 puts "HTTP head $url"
881 if {[catch {set buffer [read $sock 2148]}]} {
882 RobotError $task $url 404
883 RobotRestart $task $url $sock
886 set readCount [string length $buffer]
888 if {$readCount <= 0} {
889 RobotError $task $url 404
890 RobotRestart $task $url $sock
892 # puts "Got $readCount bytes"
893 set URL($task,$url,buf) $URL($task,$url,buf)$buffer
895 set n [string first \r\n\r\n $URL($task,$url,buf)]
899 set headbuf [string range $URL($task,$url,buf) 0 $n]
901 set URL($task,$url,charset) ISO-8859-1
902 set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
904 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
905 set lines [split $headbuf \n]
906 foreach line $lines {
907 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
908 set URL($task,$url,head,[string tolower $name]) [string trim $value]
910 regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
912 puts "HTTP CODE $code"
913 set URL($task,$url,state) skip
916 RobotRedirect $task $url $URL($task,$url,head,location) 301
917 RobotRestart $task $url $sock
920 RobotRedirect $task $url $URL($task,$url,head,location) 302
921 RobotRestart $task $url $sock
924 if {![info exists URL($task,$url,head,content-type)]} {
925 set URL($task,$url,head,content-type) {}
928 switch -glob -- $URL($task,$url,head,content-type) {
933 if {![regexp {/robots.txt$} $url]} {
934 if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
935 RobotError $task $url mimedeny
936 RobotRestart $task $url $sock
940 fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
943 RobotError $task $url $code
944 RobotRestart $task $url $sock
951 proc RobotSockCancel {task url sock} {
953 puts "RobotSockCancel sock=$sock url=$url"
954 RobotError $task $url 401
955 RobotRestart $task $url $sock
958 proc RobotConnect {task url sock} {
959 global URL agent acceptLanguage
961 fconfigure $sock -translation {lf crlf} -blocking 0
962 fileevent $sock readable [list RobotReadHeader $task $url $sock]
963 puts $sock "GET $URL($task,$url,path) HTTP/1.0"
964 puts $sock "Host: $URL($task,$url,host)"
965 puts $sock "User-Agent: $agent"
966 if {[string length $acceptLanguage]} {
967 puts $sock "Accept-Language: $acceptLanguage"
970 set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
971 if {[catch {flush $sock}]} {
972 RobotError $task $url 404
973 RobotRestart $task $url $sock
981 proc RobotGetUrl {task url phost} {
982 global URL robotsRunning
984 puts "Retrieve running=$robotsRunning url=$url task=$task"
985 if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
988 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
992 set URL($task,$url,method) $method
993 set URL($task,$url,host) $host
994 set URL($task,$url,hostport) $hostport
995 set URL($task,$url,path) $path
996 set URL($task,$url,state) head
997 set URL($task,$url,buf) {}
999 if {[string compare $path /robots.txt]} {
1001 if {![info exists URL($hostport,robots)]} {
1002 puts "READING robots.txt for host $hostport"
1003 if {[RobotFileExist $task visited $hostport /robots.txt]} {
1004 set inf [RobotFileOpen $task visited $hostport /robots.txt r]
1005 set buf [read $inf 32768]
1008 set buf "User-agent: *\nAllow: /\n"
1010 RobotsTxt0 $task URL($hostport,robots) $buf
1012 if {[info exists URL($hostport,robots)]} {
1013 foreach l $URL($hostport,robots) {
1014 if {[string first [lindex $l 1] $path] == 0} {
1015 set ok [lindex $l 0]
1021 puts "skipped due to robots.txt"
1025 if [catch {set sock [socket -async $host $port]}] {
1028 RobotConnect $task $url $sock
1033 if {![llength [info commands htmlSwitch]]} {
1034 set e [info sharedlibextension]
1035 if {[catch {load ./tclrobot$e}]} {
1040 set agent "zmbot/0.2"
1041 if {![catch {set os [exec uname -s -r]}]} {
1042 set agent "$agent ($os)"
1045 puts "agent: $agent"
1056 set acceptLanguage {}
1059 # Rules: allow, deny, url
1061 proc checkrule {task type this} {
1067 if {$debuglevel > 3} {
1068 puts "CHECKRULE $type $this"
1070 if {[info exist control($task,alrules)]} {
1071 foreach l $control($task,alrules) {
1072 if {$debuglevel > 3} {
1076 if {[lindex $l 1] != $type} continue
1077 # consider mask (! negates)
1078 set masks [lindex $l 2]
1081 foreach mask $masks {
1082 if {$debuglevel > 4} {
1083 puts "consider single mask $mask"
1085 if {[string index $mask 0] == "!"} {
1086 set mask [string range $mask 1 end]
1087 if {[string match $mask $this]} continue
1089 if {![string match $mask $this]} continue
1093 if {$debuglevel > 4} {
1097 # OK, we have a match
1098 if {[lindex $l 0] == "allow"} {
1099 if {$debuglevel > 3} {
1100 puts "CHECKRULE MATCH OK"
1104 if {$debuglevel > 3} {
1105 puts "CHECKFULE MATCH FAIL"
1111 if {$debuglevel > 3} {
1112 puts "CHECKRULE MATCH DEFAULT $default_ret"
1119 global debuglevel task
1121 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1122 if {![RobotFileExist $task visited $host $path]} {
1123 set outf [RobotFileOpen $task unvisited $host $path]
1124 RobotWriteRecord $outf href 0
1125 RobotFileClose $outf
1130 proc deny {type stuff} {
1133 lappend control($task,alrules) [list deny $type $stuff]
1136 proc allow {type stuff} {
1139 lappend control($task,alrules) [list allow $type $stuff]
1142 proc debug {level} {
1145 set debuglevel $level
1149 global tasks task status control
1153 if {[info exists tasks]} {
1154 if {[lsearch -exact $tasks $t] >= 0} {
1160 set status($t,unvisited) 0
1161 set status($t,visited) 0
1162 set status($t,bad) 0
1163 set status($t,raw) 0
1164 set status($t,active) 1
1165 set control($t,seq) 0
1166 set control($t,distance) 10
1170 # Little utility that ensures that at least one task is present (main).
1173 if {![info exist tasks]} {
1182 set l [llength $argv]
1185 puts {tclrobot: usage:}
1186 puts {tclrobot [-j jobs] [-i idle] [-c count] [-d domain] [-D dir] [-r rules] [url ..]}
1187 puts " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
1194 set arg [lindex $argv $i]
1195 switch -glob -- $arg {
1197 set t [string range $arg 2 end]
1198 if {![string length $t]} {
1199 set t [lindex $argv [incr i]]
1204 set dir [string range $arg 2 end]
1205 if {![string length $dir]} {
1206 set dir [lindex $argv [incr i]]
1208 lappend daemon_dir $dir
1211 set robotsMax [string range $arg 2 end]
1212 if {![string length $robotsMax]} {
1213 set robotsMax [lindex $argv [incr i]]
1218 set control($task,distance) [string range $arg 2 end]
1219 if {![string length $control($task,distance)]} {
1220 set control($task,distance) [lindex $argv [incr i]]
1225 set dom [string range $arg 2 end]
1226 if {![string length $dom]} {
1227 set dom [lindex $argv [incr i]]
1229 lappend control($task,domains) $dom
1232 set idletime [string range $arg 2 end]
1233 if {![string length $idletime]} {
1234 set idletime [lindex $argv [incr i]]
1239 set acceptLanguage [string range $arg 2 end]
1240 if {![string length $acceptLanguage]} {
1241 set acceptLanguage [lindex $argv [incr i]]
1246 set rfile [string range $arg 2 end]
1247 if {![string length $rfile]} {
1248 set rfile [lindex $argv [incr i]]
1250 catch {unset maxdistance}
1252 if {[info exists maxdistance]} {
1253 set control($task,distance) $maxdistance
1259 if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
1260 if {![RobotFileExist $task visited $host $path]} {
1261 set outf [RobotFileOpen $task unvisited $host $path]
1262 RobotWriteRecord $outf href 0
1263 RobotFileClose $outf
1271 if {![info exist robotsMax]} {
1275 if {[info exist daemon_dir]} {
1280 puts "max distance=$control($t,distance)"
1281 if {[info exists control($t,domains)]} {
1282 puts "domains=$control($t,domains)"
1285 puts "max jobs=$robotsMax"
1291 while {$robotsRunning} {
1295 if {[info exists tasks]} {
1297 set statusfile [open $t/status w]
1298 puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"