1 # $Id: Utils.pm,v 1.36 2007-09-20 10:12:19 mike Exp $
3 package ZOOM::IRSpy::Utils;
10 our @EXPORT_OK = qw(utf8param
17 irspy_record2identifier
18 irspy_identifier2target
24 use XML::LibXML::XPathContext;
26 use Encode qw(is_utf8);
29 our $IRSPY_NS = 'http://indexdata.com/irspy/1.0';
32 # Utility functions follow, exported for use of web UI
34 my($r, $key, $value) = @_;
35 die "utf8param() called with value '$value'" if defined $value;
37 my $raw = $r->param($key);
38 return undef if !defined $raw;
39 my $cooked = decode_utf8($raw);
40 warn "converted '$raw' to '", $cooked, "'\n" if $cooked ne $raw;
48 my($sec, $min, $hour, $mday, $mon, $year) = localtime($time);
49 return sprintf("%04d-%02d-%02dT%02d:%02d:%02d",
50 $year+1900, $mon+1, $mday, $hour, $min, $sec);
54 # I can't -- just can't, can't, can't -- believe that this function
55 # isn't provided by one of the core XML modules. But the evidence all
56 # says that it's not: among other things, XML::Generator and
57 # Template::Plugin both roll their own. So I will do likewise. D'oh!
60 my($text, $fallback, $opts) = @_;
61 if (!defined $opts && ref $fallback) {
62 # The second and third arguments are both optional
66 $opts = {} if !defined $opts;
68 $text = $fallback if !defined $text;
70 confess "xml_encode(): text and fallback both undefined"
76 # Internet Explorer can't display ' (!) so don't create it
77 #$text =~ s/['']/'/g;
78 $text =~ s/[""]/"/g;
79 $text =~ s/ / /g if $opts->{nbsp};
85 # Quotes a term for use in a CQL query
89 $term =~ s/([""\\*?])/\\$1/g;
90 $term = qq["$term"] if $term =~ /[\s""\/]/;
# Makes a CQL query that finds a specified target. Arguments may be
# either an ID alone, or a (protocol, host, port, db) quadruple.
98 my($protocol, $host, $port, $db) = @_;
102 $id = irspy_make_identifier($protocol, $host, $port, $db);
107 return "rec.id=" . cql_quote($id);
111 # PRIVATE to irspy_namespace() and irspy_xpath_context()
113 e => 'http://explain.z3950.org/dtd/2.0/',
# Maps a namespace prefix onto its URI using the private %_namespaces
# table.
#
# Arguments:
#   $prefix - the namespace prefix to look up
#
# Returns the URI registered for $prefix.  Confesses (with a stack
# trace) when $prefix is undefined, and dies when the table holds no
# URI for it.
sub irspy_namespace {
    my($prefix) = @_;

    use Carp;
    defined $prefix
        or confess "irspy_namespace(undef)";

    my $uri = $_namespaces{$prefix};
    return $uri if defined $uri;

    die "irspy_namespace(): no URI for namespace prefix '$prefix'";
}
131 sub irspy_xpath_context {
134 if (ref $record && $record->isa("ZOOM::Record")) {
135 $record = $record->render();
142 my $parser = new XML::LibXML();
143 my $doc = $parser->parse_string($record);
144 $root = $doc->getDocumentElement();
147 my $xc = XML::LibXML::XPathContext->new($root);
148 foreach my $prefix (keys %_namespaces) {
149 $xc->registerNs($prefix, $_namespaces{$prefix});
155 # Construct an opaque identifier from its components. Although it's
156 # trivial, this is needed in so many places that it really needs to be
159 # This is the converse of _parse_target_string() in IRSpy.pm, which
160 # should be renamed and moved into this package.
# Builds the opaque identifier "protocol:host:port/dbname" from its
# four components.
#
# Arguments (all required, all must be defined):
#   $protocol, $host, $port, $dbname
#
# Dies if called with anything other than exactly four arguments, or
# if any of the four is undefined; the components are checked in the
# order protocol, host, port, dbname.
sub irspy_make_identifier {
    my @parts = @_;

    if (@parts != 4) {
        my $shown = join(", ", map { "'$_'" } @parts);
        die "irspy_make_identifier(" . $shown .
            "): wrong number of arguments";
    }

    # Report the first undefined component by name.
    my @labels = qw(protocol host port dbname);
    foreach my $i (0 .. $#labels) {
        die "irspy_make_identifier(): $labels[$i] undefined"
            if !defined $parts[$i];
    }

    my($protocol, $host, $port, $dbname) = @parts;
    return "$protocol:$host:$port/$dbname";
}
177 # Returns the opaque identifier of an IRSpy record based on the
178 # XPathContext'ed DOM object, as returned by irspy_xpath_context().
179 # This is doing the same thing as irspy_make_identifier() but from a
180 # record rather than a set of parameters.
# Derives a record's opaque identifier by evaluating an XPath concat()
# over its e:serverInfo element: protocol attribute, host, port and
# database, joined as "protocol:host:port/database".
#
# Arguments:
#   $xc - XPath context for the record; find() is called on it
#
# Returns whatever $xc->find() yields for the concat() expression.
sub irspy_record2identifier {
    my($xc) = @_;

    ### Must be kept the same as is used in ../../../zebra/*.xsl
    my $expression = "concat(e:serverInfo/\@protocol, ':',
                             e:serverInfo/e:host, ':',
                             e:serverInfo/e:port, '/',
                             e:serverInfo/e:database)";

    return $xc->find($expression);
}
# Transforms an IRSpy opaque identifier, as returned from
194 # irspy_make_identifier() or irspy_record2identifier(), into a YAZ
195 # target-string suitable for feeding to ZOOM. Before we introduced
196 # the protocol element at the start of the identifier string, this was
197 # a null transform; now we have to be a bit cleverer.
# Public entry point: hands its arguments straight to
# _irspy_identifier2target() and returns that function's result.  The
# wrapper exists so the conversion can be traced by re-enabling the
# commented-out carp below.
sub irspy_identifier2target {
    my @args = @_;

    my $target = _irspy_identifier2target(@args);
    #carp "converted ID '@args' to target '$target'";
    return $target;
}
# Does the real work of irspy_identifier2target().
#
# The identifier is split at its first colon: the text before it is
# the protocol (compared case-insensitively), the rest is the target.
#   Z39.50 -> "tcp:<rest>"
#   SRU    -> "sru=get,http:<rest>"
#   SRW    -> "sru=srw,http:<rest>"
# Any other protocol produces a warning and the rest is returned as is.
#
# Confesses (with a stack trace) if $id is undefined.
sub _irspy_identifier2target {
    my($id) = @_;

    defined $id
        or confess "_irspy_identifier2target(): id is undefined";

    my($protocol, $target) = ($id =~ /(.*?):(.*)/);

    # Prefix that YAZ expects for each protocol we know about.
    my %scheme_for = (
        "Z39.50" => "tcp:",
        "SRU"    => "sru=get,http:",
        "SRW"    => "sru=srw,http:",
    );

    my $scheme = $scheme_for{uc($protocol)};
    return $scheme . $target if defined $scheme;

    warn "unrecognised protocol '$protocol' in ID $id";
    return $target;
}
225 sub modify_xml_document {
226 my($xc, $fieldsByKey, $data) = @_;
229 foreach my $key (keys %$data) {
230 my $value = $data->{$key};
231 my $ref = $fieldsByKey->{$key} or die "no field '$key'";
232 my($name, $nlines, $caption, $xpath, @addAfter) = @$ref;
233 #print "Considering $key='$value' ($xpath)<br/>\n";
234 my @nodes = $xc->findnodes($xpath);
236 warn scalar(@nodes), " nodes match '$xpath'" if @nodes > 1;
237 my $node = $nodes[0];
239 if ($node->isa("XML::LibXML::Attr")) {
240 if ($value ne $node->getValue()) {
241 $node->setValue($value);
243 #print "Attr $key: '", $node->getValue(), "' -> '$value' ($xpath)<br/>\n";
245 } elsif ($node->isa("XML::LibXML::Element")) {
246 # The contents could be any mixture of text and
247 # comments and maybe even other crud such as processing
248 # instructions. The simplest thing is just to throw it all
249 # away and start again, making a single Text node the
250 # canonical representation. But before we do that,
251 # we'll check whether the element is already
252 # canonical, to determine whether our change is a
255 my @children = $node->childNodes();
256 if (@children == 1) {
257 my $child = $node->firstChild();
258 if (ref $child && ref $child eq "XML::LibXML::Text") {
259 $old = $child->getData();
260 #print STDERR "child='$child', old=", _renderchars($old), "\n" if $key eq "title";
263 next if $value eq $old;
265 $node->removeChildNodes();
266 my $child = new XML::LibXML::Text($value);
267 $node->appendChild($child);
269 #print STDERR "Elem $key ($xpath): ", _renderchars($old), " -> '", _renderchars($value), "\n";
271 warn "unexpected node type $node";
275 next if !$value; # No need to create a new empty node
276 my($ppath, $selector) = $xpath =~ /(.*)\/(.*)/;
277 dom_add_node($xc, $ppath, $selector, $value, @addAfter);
278 #print "New $key ($xpath) = '$value'<br/>\n";
290 return "'" . $text . "'", " (", join(" ", map {ord($_)} split //, $text), "), is_utf8=" , is_utf8($text);
295 my($xc, $ppath, $selector, $value, @addAfter) = @_;
297 #print "Adding $selector='$value' at '$ppath' after (", join(", ", map { "'$_'" } @addAfter), ")<br/>\n";
298 my $node = find_or_make_node($xc, $ppath, 0);
299 die "couldn't find or make node '$node'" if !defined $node;
301 my $is_attr = ($selector =~ s/^@//);
302 my(undef, $prefix, $simpleSel) = $selector =~ /((.*?):)?(.*)/;
303 #warn "selector='$selector', prefix='$prefix', simpleSel='$simpleSel'";
305 if (defined $prefix) {
### This seems to no-op (thanks, DOM!) but I have no
307 # idea, and it's not needed for IRSpy, so I am not going
309 $node->setAttributeNS(irspy_namespace($prefix),
312 $node->setAttribute($simpleSel, $value);
317 my $new = new XML::LibXML::Element($simpleSel);
318 $new->setNamespace(irspy_namespace($prefix), $prefix)
321 $new->appendText($value);
322 foreach my $predecessor (reverse @addAfter) {
323 my($child) = $xc->findnodes($predecessor, $node);
324 if (defined $child) {
325 $node->insertAfter($new, $child);
326 #warn "Added after '$predecessor'";
331 # Didn't find any of the nodes that are supposed to precede the
332 # new one, so we need to insert the new node as the first of the
333 # parent's children. However *sigh* there is no prependChild()
334 # analogous to appendChild(), so we have to go the long way round.
335 my @children = $node->childNodes();
337 $node->insertBefore($new, $children[0]);
338 #warn "Added new first child";
340 $node->appendChild($new);
341 #warn "Added new only child";
345 my $text = xml_encode(inheritance_tree($xc));
346 $text =~ s/\n/<br\/>$&/sg;
347 print "<pre>$text</pre>\n";
# Returns the node that the XPath expression $path selects within the
# XPath context $xc, creating it -- and, by recursion, any missing
# ancestors -- when nothing matches.
#
# Arguments:
#   $xc              - XPath context; findnodes() is called on it
#   $path            - XPath of the wanted node; "" is treated as "."
#   $recursion_level - nesting depth of this call; outside callers
#                      pass 0 (an omitted value is treated as 0)
#
# Returns the first matching node, or the newly created element.
# Warns when more than one node matches.  Dies once the nesting depth
# reaches 10, which guards against a path that can never be resolved.
sub find_or_make_node {
    my($xc, $path, $recursion_level) = @_;
    $recursion_level = 0 if !defined $recursion_level;

    die "deep recursion in find_or_make_node($path)"
        if $recursion_level >= 10;
    $path = "." if $path eq "";

    my @nodes = $xc->findnodes($path);
    if (@nodes == 0) {
        # The node doesn't exist, so make it: split off the last path
        # step, find (or make) everything before it, then hang a new
        # element for the last step off that parent.
        my(undef, $ppath, $element) = $path =~ /((.*)\/)?(.*)/;
        $ppath = "" if !defined $ppath;
        #warn "path='$path', ppath='$ppath', element='$element'";
        #warn "no node '$path': making it";

        # The depth must grow on the way down, otherwise the guard
        # above can never trigger.
        my $parent = find_or_make_node($xc, $ppath, $recursion_level + 1);

        # An optional "prefix:" on the step names the element's namespace.
        my(undef, $prefix, $nsElem) = $element =~ /((.*?):)?(.*)/;
        #warn "element='$element', prefix='$prefix', nsElem='$nsElem'";
        my $new = XML::LibXML::Element->new($nsElem);
        if (defined $prefix) {
            #warn "setNamespace($prefix)";
            $new->setNamespace(irspy_namespace($prefix), $prefix);
        }

        $parent->appendChild($new);
        return $new;
    }

    warn scalar(@nodes), " nodes match parent '$path'" if @nodes > 1;
    return $nodes[0];
}
# Renders the inheritance hierarchy of a class as indented text, for
# debugging.
#
# Arguments:
#   $type  - a class name, or an object (its class is used)
#   $level - current depth; omit it on the outermost call
#
# Returns one line per class: the class itself prefixed with "--> ",
# then each entry of its @ISA, recursively, indented by one tab per
# level.  Below 20 levels the walk is cut short with a marker line.
sub inheritance_tree {
    my($type, $level) = @_;
    $level = 0 if !defined $level;
    return "Woah! Too deep, man!\n" if $level > 20;

    $type = ref $type if ref $type;

    my $text = $level == 0 ? "--> " : "";
    $text .= ("\t" x $level) . "$type\n";

    # Read @ISA through the symbol table instead of a string eval, so
    # the class name is only ever used as a name and never compiled
    # as Perl source.
    my @parents = do {
        no strict 'refs';
        @{"${type}::ISA"};
    };

    foreach my $superclass (@parents) {
        $text .= inheritance_tree($superclass, $level+1);
    }

    return $text;
}
402 # This function is made available in xslt using the register_function call
404 my ($arg1, $arg2) = @_;
405 return "$arg1" cmp "$arg2";
409 ### It feels like this should be in YAZ, exported via ZOOM-Perl.
410 my %_bib1_access_point = (
411 1 => "Personal name",
412 2 => "Corporate name",
413 3 => "Conference name",
416 6 => "Title uniform",
419 9 => "LC card number",
420 10 => "BNB card no.",
422 12 => "Local number",
423 13 => "Dewey classification",
424 14 => "UDC classification",
425 15 => "Bliss classification",
426 16 => "LC call number",
427 17 => "NLM call number",
428 18 => "NAL call number",
429 19 => "MOS call number",
430 20 => "Local classification",
431 21 => "Subject heading",
432 22 => "Subject Rameau",
433 23 => "BDI index subject",
434 24 => "INSPEC subject",
435 25 => "MESH subject",
437 27 => "LC subject heading",
438 28 => "RVM subject heading",
439 29 => "Local subject index",
441 31 => "Date of publication",
442 32 => "Date of acquisition",
444 34 => "Title collective",
445 35 => "Title parallel",
447 37 => "Title added title page",
448 38 => "Title caption",
449 39 => "Title running",
451 41 => "Title other variant",
452 42 => "Title former",
453 43 => "Title abbreviated",
454 44 => "Title expanded",
455 45 => "Subject precis",
456 46 => "Subject rswk",
457 47 => "Subject subdivision",
458 48 => "No. nat'l biblio.",
459 49 => "No. legal deposit",
460 50 => "No. govt pub.",
461 51 => "No. music publisher",
463 53 => "Number local call",
464 54 => "Code--language",
465 55 => "Code--geographic area",
466 56 => "Code--institution",
467 57 => "Name and title *",
468 58 => "Name geographic",
469 59 => "Place publication",
471 61 => "Microform generation",
474 1000 => "Author-title",
475 1001 => "Record type",
478 1004 => "Author-name personal",
479 1005 => "Author-name corporate",
480 1006 => "Author-name conference",
481 1007 => "Identifier--standard",
482 1008 => "Subject--LC children's",
483 1009 => "Subject name -- personal",
484 1010 => "Body of text",
485 1011 => "Date/time added to db",
486 1012 => "Date/time last modified",
487 1013 => "Authority/format id",
488 1014 => "Concept-text",
489 1015 => "Concept-reference",
491 1017 => "Server-choice",
493 1019 => "Record-source",
496 1022 => "Geographic-class",
497 1023 => "Indexed-by",
500 1026 => "Related-periodical",
501 1027 => "Report-number",
502 1028 => "Stock-number",
503 1030 => "Thematic-number",
504 1031 => "Material-type",
507 1034 => "Content-type",
509 1036 => "Author-Title-Subject",
510 1032 => "Doc-id (semantic definition change)",
512 1038 => "Abstract-language",
513 1039 => "Application-kind",
514 1040 => "Classification",
515 1041 => "Classification-basic",
516 1042 => "Classification-local-record",
518 1044 => "Possessing-institution",
519 1045 => "Record-linking",
520 1046 => "Record-status",
522 1048 => "Control-number-GKD",
523 1049 => "Control-number-linking",
524 1050 => "Control-number-PND",
525 1051 => "Control-number-SWD",
526 1052 => "Control-number-ZDB",
527 1053 => "Country-publication (country of Publication)",
528 1054 => "Date-conference (meeting date)",
529 1055 => "Date-record-status",
530 1056 => "Dissertation-information",
531 1057 => "Meeting-organizer",
532 1058 => "Note-availability",
533 1059 => "Number-CAS-registry (CAS registry number)",
534 1060 => "Number-document (document number)",
535 1061 => "Number-local-accounting",
536 1062 => "Number-local-acquisition",
537 1063 => "Number-local-call-copy-specific",
538 1064 => "Number-of-reference (reference count)",
539 1065 => "Number-norm",
540 1066 => "Number-volume",
541 1067 => "Place-conference (meeting location)",
542 1068 => "Reference (references and footnotes)",
543 1069 => "Referenced-journal (reference work)",
544 1070 => "Section-code",
545 1071 => "Section-heading",
546 1072 => "Subject-GOO",
547 1073 => "Subject-name-conference",
548 1074 => "Subject-name-corporate",
549 1075 => "Subject-genre/form",
550 1076 => "Subject-name-geographical",
551 1077 => "Subject--chronological",
552 1078 => "Subject--title",
553 1079 => "Subject--topical",
554 1080 => "Subject-uncontrolled",
555 1081 => "Terminology-chemical (chemical name)",
556 1082 => "Title-translated",
557 1083 => "Year-of-beginning",
558 1084 => "Year-of-ending",
559 1085 => "Subject-AGROVOC",
560 1086 => "Subject-COMPASS",
561 1087 => "Subject-EPT",
562 1088 => "Subject-NAL",
563 1089 => "Classification-BCM",
564 1090 => "Classification-DB",
565 1091 => "Identifier-ISRC",
566 1092 => "Identifier-ISMN",
567 1093 => "Identifier-ISRN",
568 1094 => "Identifier-DOI",
569 1095 => "Code-language-original",
570 1096 => "Title-later",
572 1098 => "DC-Creator",
573 1099 => "DC-Subject",
574 1100 => "DC-Description",
575 1101 => "DC-Publisher",
577 1103 => "DC-ResourceType",
578 1104 => "DC-ResourceIdentifier",
579 1105 => "DC-Language",
580 1106 => "DC-OtherContributor",
583 1109 => "DC-Relation",
584 1110 => "DC-Coverage",
585 1111 => "DC-RightsManagement",
586 1112 => "Controlled Subject Index",
587 1113 => "Subject Thesaurus",
588 1114 => "Index Terms -- Controlled",
589 1115 => "Controlled Term",
590 1116 => "Spatial Domain",
591 1117 => "Bounding Coordinates",
592 1118 => "West Bounding Coordinate",
593 1119 => "East Bounding Coordinate",
594 1120 => "North Bounding Coordinate",
595 1121 => "South Bounding Coordinate",
597 1123 => "Place Keyword Thesaurus",
598 1124 => "Place Keyword",
599 1125 => "Time Period",
600 1126 => "Time Period Textual",
601 1127 => "Time Period Structured",
602 1128 => "Beginning Date",
603 1129 => "Ending Date",
604 1130 => "Availability",
605 1131 => "Distributor",
606 1132 => "Distributor Name",
607 1133 => "Distributor Organization",
608 1134 => "Distributor Street Address",
609 1135 => "Distributor City",
610 1136 => "Distributor State or Province",
611 1137 => "Distributor Zip or Postal Code",
612 1138 => "Distributor Country",
613 1139 => "Distributor Network Address",
614 1140 => "Distributor Hours of Service",
615 1141 => "Distributor Telephone",
616 1142 => "Distributor Fax",
617 1143 => "Resource Description",
618 1144 => "Order Process",
619 1145 => "Order Information",
621 1147 => "Cost Information",
622 1148 => "Technical Prerequisites",
623 1149 => "Available Time Period",
624 1150 => "Available Time Textual",
625 1151 => "Available Time Structured",
626 1152 => "Available Linkage",
627 1153 => "Linkage Type",
629 1155 => "Sources of Data",
630 1156 => "Methodology",
631 1157 => "Access Constraints",
632 1158 => "General Access Constraints",
633 1159 => "Originator Dissemination Control",
634 1160 => "Security Classification Control",
635 1161 => "Use Constraints",
636 1162 => "Point of Contact",
637 1163 => "Contact Name",
638 1164 => "Contact Organization",
639 1165 => "Contact Street Address",
640 1166 => "Contact City",
641 1167 => "Contact State or Province",
642 1168 => "Contact Zip or Postal Code",
643 1169 => "Contact Country",
644 1170 => "Contact Network Address",
645 1171 => "Contact Hours of Service",
646 1172 => "Contact Telephone",
647 1173 => "Contact Fax",
648 1174 => "Supplemental Information",
650 1176 => "Agency Program",
651 1177 => "Cross Reference",
652 1178 => "Cross Reference Title",
653 1179 => "Cross Reference Relationship",
654 1180 => "Cross Reference Linkage",
655 1181 => "Schedule Number",
656 1182 => "Original Control Identifier",
657 1183 => "Language of Record",
658 1184 => "Record Review Date",
660 1186 => "Performer-Individual",
661 1187 => "Performer-Group",
662 1188 => "Instrumentation",
663 1189 => "Instrumentation-Original",
664 1190 => "Instrumentation-Current",
665 1191 => "Arrangement",
666 1192 => "Arrangement-Original",
667 1193 => "Arrangement-Current",
668 1194 => "Musical Key-Original",
669 1195 => "Musical Key-Current",
670 1196 => "Date-Composition",
671 1197 => "Date-Recording",
672 1198 => "Place-Recording",
673 1199 => "Country-Recording",
674 1200 => "Number-ISWC",
675 1201 => "Number-Matrix",
676 1202 => "Number-Plate",
677 1203 => "Classification-McColvin",
679 1205 => "Number-Copies",
680 1206 => "Musical Theme",
681 1207 => "Instruments - total number",
682 1208 => "Instruments - distinct number",
683 1209 => "Identifier - URN",
684 1210 => "Sears Subject Heading",
685 1211 => "OCLC Number",
686 1212 => "Composition",
687 1213 => "Intellectual level",
691 1217 => "Nationality",
693 1219 => "Compression",
695 1221 => "Subject - occupation",
696 1222 => "Subject - function",
# Translates a BIB-1 use-attribute number into its human-readable name
# using the private %_bib1_access_point table.
#
# Arguments:
#   $ap - the attribute (access point) number
#
# Returns the name from the table, or the string
# "unknown BIB-1 attribute '<ap>'" when the table has no true value
# for that number.
sub bib1_access_point {
    my($ap) = @_;

    my $name = $_bib1_access_point{$ap};
    return $name if $name;

    return "unknown BIB-1 attribute '$ap'";
}
709 my($rs, $which, $elementSetName) = @_;
711 # There is a slight race condition here on the element-set name,
712 # but it shouldn't be a problem as this is (currently) only called
713 # from parts of the program that run single-threaded.
714 my $old = $rs->option(elementSetName => $elementSetName);
715 my $rec = $rs->record($which);
716 $rs->option(elementSetName => $old);
718 return $rec->render();