3 The University of Liverpool
5 Modifications to Zebra 1.1 / YAZ 1.7 to enable ranking
8 Copyright (c) 2001-2002 The University of Liverpool. All
11 Licensed under the Academic Free License version 1.1.
12 http://opensource.org/licenses/academic.php
14 $Id: livcode.c,v 1.4 2004-10-26 15:32:11 heikki Exp $
18 #ifdef SKIPTHIS /* Need to fix the interface - FIXME */
33 ** These functions/routines
34 ** 1. reads in and builds a linked list of rank attr/rank score pairs
35 ** 2. expand a simple query into a paired list of complex/simple nodes.
40 struct rstype *next_rsnode ;
44 } rsnode, *refrsnode ;
46 refrsnode start_rsnode = NULL ;
49 ** Function/Routine prototypes
51 static int search_for_score( char *rankstr ) ;
52 static char *search_for_rankstr( int rank ) ;
53 static int search_for_rank( int rank ) ;
54 static refrsnode set_rsnode( int rank, int score ) ;
55 static int read_zrank_file(ZebraHandle zh) ;
57 static void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
58 static void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct ) ;
59 static Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop ) ;
60 static Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp ) ;
61 static Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 ) ;
62 static Z_Operand *set_operand( Z_Operand *thisop, int newattr ) ;
63 static int check_operand_attrs( Z_Operand *thisop ) ;
67 ** given a rank-string traverse down the linked list ;
68 ** return its score if found otherwise return -1.
70 int search_for_score( char *rankstr )
72 refrsnode node = start_rsnode ;
75 if ( sscanf( rankstr,"%d",&rank ) )
79 if ( node->rank == rank ) return node->score ;
80 node = node->next_rsnode ;
87 ** search_for_rankstr()
88 ** given a rank traverse down the linked list ;
89 ** return its string if found otherwise return NULL.
91 char *search_for_rankstr( int rank )
93 refrsnode node = start_rsnode ;
97 if ( node->rank == rank ) return node->rankstr ;
98 node = node->next_rsnode ;
105 ** given a rank traverse down the linked list ;
106 ** return 1 if found otherwise return 0.
108 int search_for_rank( int rank )
110 refrsnode node = start_rsnode ;
114 if ( node->rank == rank ) return 1 ;
115 node = node->next_rsnode ;
122 ** given a rank and a score, build the rest of the rsnode structure.
124 refrsnode set_rsnode( int rank, int score )
127 refrsnode node = (refrsnode)malloc( sizeof(rsnode) ) ;
130 node->next_rsnode = NULL ;
132 node->score = score ;
134 sprintf( buff,"%d",rank ) ;
135 node->rankstr = (char *)malloc( strlen(buff)+1 ) ;
136 strcpy( node->rankstr, buff ) ;
142 ** read_zrank_file(zh)
143 ** read in the rankfile and build the rank/score linked list ;
144 ** return 0 : can't open the zebra config. file
145 ** return 0 : can't find the rankfile entry in the zebra config. file
146 ** return 0 : can't open the rankfile itself
147 ** return the number of distinct ranks read in.
149 int read_zrank_file(ZebraHandle zh)
152 char line[ LINEMAX ] ;
153 char rname[ LINEMAX ] ;
161 ** open the zebra configuration file and look for the "rankfile:"
162 ** entry which contains the path/name of the rankfile
165 const char *rankfile = res_get_def(zh->res, "rankfile", 0);
166 const char *profilePath = res_get_def(zh->res, "profilePath",
167 DEFAULT_PROFILE_PATH);
171 yaz_log(LOG_LOG, "rankfile entry not found in config file" ) ;
174 ifd = yaz_path_fopen(profilePath, rankfile, "r" ) ;
177 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
179 if ( sscanf( lineptr,"rankfile: %s", rname ) == 1 )
184 ** open the rankfile and read the rank/score pairs
186 ** ignore duplicate ranks
187 ** ignore ranks without +ve scores
191 if ( !(ifd = fopen( rankfile, "r" )) )
193 logf( LOG_LOG, "unable to open rankfile %s",rankfile ) ;
197 while ( (lineptr = fgets( line,LINEMAX,ifd )) )
199 sscanf( lineptr,"%d : %d", &rank,&score ) ;
200 if ( ( score > 0 ) && ( rank != 1016 ) )
202 refrsnode new_rsnode ;
204 if ( search_for_rank( rank ) == 0 )
206 new_rsnode = set_rsnode( rank,score ) ;
207 new_rsnode->next_rsnode = start_rsnode ;
208 start_rsnode = new_rsnode ;
216 yaz_log(LOG_WARN|LOG_ERRNO, "unable to open config file (%s)",
225 ** build an operand "node" - hav to make a complete copy of thisop and
226 ** then insert newattr in the appropriate place
229 Z_Operand *set_operand( Z_Operand *thisop, int newattr )
232 Z_AttributesPlusTerm *attributesplusterm ;
233 Z_AttributeList *attributelist ;
234 Z_AttributeElement *attributeelement ;
235 Z_AttributeElement *attrptr ;
236 Z_AttributeElement **attrptrptr ;
241 operand = (Z_Operand *)
242 malloc( sizeof( Z_Operand ) ) ;
243 attributesplusterm = (Z_AttributesPlusTerm *)
244 malloc( sizeof( Z_AttributesPlusTerm ) ) ;
245 attributelist = (Z_AttributeList *)
246 malloc( sizeof( Z_AttributeList ) ) ;
247 attributeelement = (Z_AttributeElement *)
248 malloc( sizeof( Z_AttributeElement ) ) ;
250 malloc( sizeof( Z_Term ) ) ;
251 general = (Odr_oct *)
252 malloc( sizeof( Odr_oct ) ) ;
254 operand->which = Z_Operand_APT ;
255 operand->u.attributesPlusTerm = attributesplusterm ;
257 attributesplusterm->attributes = attributelist ;
258 attributesplusterm->term = term ;
260 attributelist->num_attributes = thisop->u.attributesPlusTerm->
261 attributes->num_attributes ;
263 attrptr = (Z_AttributeElement *) malloc( sizeof(Z_AttributeElement) *
264 attributelist->num_attributes ) ;
265 attrptrptr = (Z_AttributeElement **) malloc( sizeof(Z_AttributeElement) *
266 attributelist->num_attributes ) ;
268 attributelist->attributes = attrptrptr ;
270 for ( i = 0 ; i < attributelist->num_attributes ; i++ )
272 *attrptr = *thisop->u.attributesPlusTerm->attributes->attributes[i] ;
274 attrptr->attributeType = (int *)malloc( sizeof(int *) ) ;
275 *attrptr->attributeType = *thisop->u.attributesPlusTerm->attributes->
276 attributes[i]->attributeType;
278 attrptr->value.numeric = (int *)malloc( sizeof(int *) ) ;
279 *attrptr->value.numeric = *thisop->u.attributesPlusTerm->attributes->
280 attributes[i]->value.numeric;
282 if ( (*attrptr->attributeType == 1) &&
283 (*attrptr->value.numeric == 1016) )
285 *attrptr->value.numeric = newattr ;
287 *attrptrptr++ = attrptr++ ;
290 term->which = Z_Term_general ;
291 term->u.general = general ;
293 general->len = thisop->u.attributesPlusTerm->term->u.general->len ;
294 general->size = thisop->u.attributesPlusTerm->term->u.general->size ;
295 general->buf = malloc( general->size ) ;
296 strcpy( general->buf,
297 thisop->u.attributesPlusTerm->term->u.general->buf ) ;
304 ** build a complex "node" with two (simple) operand "nodes" as branches
306 Z_Complex *set_2operands( Z_Operand *sim1,Z_Operand *sim2 )
311 Z_Operator *roperator ;
313 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
314 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
315 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
316 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
318 top->roperator = roperator ;
319 top->roperator->which = Z_Operator_or ;
320 top->roperator->u.op_or = odr_nullval() ;
323 top->s1->which = Z_RPNStructure_simple ;
324 top->s1->u.simple = sim1 ;
327 top->s2->which = Z_RPNStructure_simple ;
328 top->s2->u.simple = sim2 ;
334 ** set_1complex_1operand()
335 ** build a complex "node" with a complex "node" branch and an
336 ** operand "node" branch
338 Z_Complex *set_1complex_1operand( Z_Complex *comp,Z_Operand *simp )
343 Z_Operator *roperator ;
345 top = (Z_Complex *) malloc( sizeof( Z_Complex ) ) ;
346 s1 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
347 s2 = (Z_RPNStructure *)malloc( sizeof( Z_RPNStructure ) ) ;
348 roperator = (Z_Operator *) malloc( sizeof( Z_Operator ) ) ;
350 top->roperator = roperator ;
351 top->roperator->which = Z_Operator_or ;
352 top->roperator->u.op_or = odr_nullval() ;
355 top->s1->which = Z_RPNStructure_complex ;
356 top->s1->u.complex = comp ;
359 top->s2->which = Z_RPNStructure_simple ;
360 top->s2->u.simple = simp ;
367 ** expand a simple query into a number of complex queries
369 Z_Complex *expand_query(ZebraHandle zh, Z_Operand *thisop )
375 ** start_rsnode will be set if we have already read the rankfile
376 ** so don't bother again but we need to know the number of attributes
377 ** in the linked list so traverse it again to find out how many.
381 refrsnode node = start_rsnode ;
385 node = node->next_rsnode ;
390 ** only expand the query if there are 2 or more attributes
394 refrsnode node = start_rsnode ;
398 attr1 = node->rank ; node = node->next_rsnode ;
399 attr2 = node->rank ; node = node->next_rsnode ;
402 ** this is the special case and has to be done first because the
403 ** last complex node in the linear list has two simple nodes whereas
404 ** all the others have a complex and a simple.
406 top = set_2operands( set_operand( thisop,attr1 ),
407 set_operand( thisop,attr2 ) ) ;
410 ** do the rest as complex/simple pairs
414 attr1 = node->rank ; node = node->next_rsnode ;
415 top = set_1complex_1operand( top,set_operand( thisop,attr1 ) ) ;
418 ** finally add the 1016 rank attribute at the top of the tree
420 top = set_1complex_1operand( top,set_operand( thisop,1016 ) ) ;
428 ** check_operand_attrs()
429 ** loop through the attributes of a particular operand
430 ** return 1 if (type==1 && value==1016) && (type==2 && value==102)
431 ** otherwise return 0
433 int check_operand_attrs( Z_Operand *thisop )
435 Z_AttributeElement *attrptr ;
441 numattrs = thisop->u.attributesPlusTerm->attributes->num_attributes ;
443 for ( i = 0 ; i < numattrs ; i++ )
445 attrptr = thisop->u.attributesPlusTerm->attributes->attributes[i] ;
447 if ( (*attrptr->attributeType == 1) &&
448 (*attrptr->value.numeric == 1016) )
451 if ( (*attrptr->attributeType == 2) &&
452 (*attrptr->value.numeric == 102) )
456 return (cond1 & cond2) ;
460 ** convert_simple2complex()
463 void convert_simple2complex(ZebraHandle zh, Z_RPNStructure *rpnstruct )
465 Z_Complex *complex = NULL ;
466 Z_Operand *operand = rpnstruct->u.simple ;
468 if ( check_operand_attrs( operand ) )
470 complex = expand_query(zh, operand ) ;
475 ** Everything is complete so replace the original
476 ** operand with the newly built complex structure
477 ** This is it ... no going back!!
479 rpnstruct->which = Z_RPNStructure_complex ;
480 rpnstruct->u.complex = complex ;
486 ** walk_complex_query()
487 ** recursively traverse the tree expanding any simple queries we find
489 void walk_complex_query(ZebraHandle zh, Z_RPNStructure *rpnstruct )
491 if ( rpnstruct->which == Z_RPNStructure_simple )
493 convert_simple2complex(zh, rpnstruct ) ;
497 walk_complex_query(zh, rpnstruct->u.complex->s1 ) ;
498 walk_complex_query(zh, rpnstruct->u.complex->s2 ) ;
502 void zebra_livcode_transform(ZebraHandle zh, Z_RPNQuery *query)
505 ** Got a search request,
506 ** 1. if it is a simple query, see if it suitable for expansion
507 ** i.e. the attributes are of the form ...
508 ** (type==1 && value==1016) && (type==2 && value==102)
510 ** 2. if it is complex, traverse the complex query tree and expand
511 ** any simples simples as above
514 Z_RPNStructure *rpnstruct = query->RPNStructure ;
516 if ( rpnstruct->which == Z_RPNStructure_simple )
518 convert_simple2complex(zh, rpnstruct ) ;
520 else if ( rpnstruct->which == Z_RPNStructure_complex )
522 walk_complex_query(zh, rpnstruct ) ;
528 struct rank_class_info {
532 struct rank_term_info {
539 struct rank_set_info {
544 struct rank_term_info *entries;
547 static int log2_int (unsigned g)
556 * create: Creates/Initialises this rank handler. This routine is
557 * called exactly once. The routine returns the class_handle.
559 static void *create (ZebraHandle zh)
561 struct rank_class_info *ci = (struct rank_class_info *)
562 xmalloc (sizeof(*ci));
564 logf (LOG_DEBUG, "livrank create");
566 read_zrank_file(zh) ;
572 * destroy: Destroys this rank handler. This routine is called
573 * when the handler is no longer needed - i.e. when the server
574 * dies. The class_handle was previously returned by create.
576 static void destroy (struct zebra_register *reg, void *class_handle)
578 struct rank_class_info *ci = (struct rank_class_info *) class_handle;
580 logf (LOG_DEBUG, "livrank destroy");
586 * begin: Prepares beginning of "real" ranking. Called once for
587 * each result set. The returned handle is a "set handle" and
588 * will be used in each of the handlers below.
590 static void *begin (struct zebra_register *reg, void *class_handle,
591 RSET rset, NMEM nmem)
593 struct rank_set_info *si = (struct rank_set_info *) xmalloc (sizeof(*si));
596 logf (LOG_DEBUG, "livrank begin");
597 /* FIXME - Now that we don't have term counts in rsets, what do we */
598 /* do about this ??? */
599 si->no_entries = 0; /* rset->no_rset_terms; */ /* FIXME ??? */
600 si->no_rank_entries = 0;
602 si->entries = (struct rank_term_info *)
603 xmalloc (sizeof(*si->entries)*si->no_entries);
604 for (i = 0; i < si->no_entries; i++)
606 const char *flags = ""; /* rset->rset_terms[i]->flags; *//* FIXME ???*/
607 int g = 0; /* rset->rset_terms[i]->nn; */ /* FIXME ??? */
608 const char *cp = strstr(flags, ",u=");
610 si->entries[i].rank_flag = 1;
613 char *t = search_for_rankstr(atoi(cp+3));
615 si->entries[i].rank_flag = search_for_score(t) ;
617 if ( si->entries[i].rank_flag )
618 (si->no_rank_entries)++;
620 si->entries[i].local_occur = 0;
621 si->entries[i].global_occur = g;
622 si->entries[i].global_inv = 32 - log2_int (g);
623 logf (LOG_DEBUG, "-------- %d ------", 32 - log2_int (g));
629 * end: Terminates ranking process. Called after a result set
632 static void end (struct zebra_register *reg, void *set_handle)
634 struct rank_set_info *si = (struct rank_set_info *) set_handle;
635 logf (LOG_DEBUG, "livrank end");
641 * add: Called for each word occurence in a result set. This routine
642 * should be as fast as possible. This routine should "incrementally"
645 static void add (void *set_handle, int seqno, int term_index)
647 struct rank_set_info *si = (struct rank_set_info *) set_handle;
648 logf (LOG_DEBUG, "rank-1 add seqno=%d term_index=%d", seqno, term_index);
649 si->last_pos = seqno;
650 si->entries[term_index].local_occur++;
654 * calc: Called for each document in a result. This handler should
655 * produce a score based on previous call(s) to the add handler. The
656 * score should be between 0 and 1000. If score cannot be obtained
657 * -1 should be returned.
659 static int calc (void *set_handle, zint sysno)
661 int i, lo, divisor, score = 0;
662 struct rank_set_info *si = (struct rank_set_info *) set_handle;
664 logf (LOG_DEBUG, "livrank calc sysno=" ZINT_FORMAT, sysno);
666 if (!si->no_rank_entries)
668 for (i = 0; i < si->no_entries; i++)
670 score += si->entries[i].local_occur * si->entries[i].rank_flag ;
672 for (i = 0; i < si->no_entries; i++)
673 if (si->entries[i].rank_flag && (lo = si->entries[i].local_occur))
674 score += (8+log2_int (lo)) * si->entries[i].global_inv;
676 divisor = si->no_rank_entries * (8+log2_int (si->last_pos/si->no_entries));
677 score = score / divisor;
680 for (i = 0; i < si->no_entries; i++)
681 si->entries[i].local_occur = 0;
686 * Pseudo-meta code with sequence of calls as they occur in a
687 * server. Handlers are prefixed by --:
703 static struct rank_control rank_control = {
713 struct rank_control *rankliv_class = &rank_control;