1 /* $Id: rsbetween.c,v 1.29 2004-11-01 15:53:57 heikki Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
24 /* rsbetween is (mostly) used for xml searches. It returns the hits of the
25 * "middle" rset, that are in between the "left" and "right" rsets. For
26 * example "Shakespeare" in between "<author>" and </author>. The thing is
27 * complicated by the inclusion of attributes (from their own rset). If attrs
28 * specified, they must match the "left" rset (start tag). "Hamlet" between
29 * "<title lang=eng>" and "</title>". (This assumes that the attributes are
30 * indexed to the same seqno as the tags).
32 * Currently fails to return all hits from a record, which breaks the ranking.
33 * This is bug #202. Seems like there is no decent way to get that working
34 * with the current implementation. I have planned a new method for doing
35 * this, see at the end of this file.
47 #define RSBETWEEN_DEBUG 0
49 static RSFD r_open_between (RSET ct, int flag);
50 static void r_close_between (RSFD rfd);
51 static void r_delete_between (RSET ct);
52 static int r_forward_between(RSFD rfd, void *buf,
53 TERMID *term, const void *untilbuf);
54 static int r_read_between (RSFD rfd, void *buf, TERMID *term );
55 static int r_write_between (RSFD rfd, const void *buf);
56 static void r_pos_between (RSFD rfd, double *current, double *total);
57 static void r_get_terms(RSET ct, TERMID *terms, int maxterms, int *curterm);
59 static const struct rset_control control =
73 const struct rset_control *rset_kind_between = &control;
75 struct rset_between_info {
76 RSET rset_l; /* left arg, start tag */
77 RSET rset_m; /* the thing itself */
78 RSET rset_r; /* right arg, end tag */
79 RSET rset_attr; /* attributes , optional */
82 struct rset_between_rfd {
95 TERMID term_m; /* we only return terms for the mid argument */
96 int level; /* counting start/end tags */
97 int attr_match; /* did we have a matching attr for L */
102 static void log2 (RSFD rfd, char *msg, int cmp_l, int cmp_r)
104 struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
106 logf(LOG_LOG,"between: %s cmp_l=%d cmp_r=%d", msg, cmp_l, cmp_r);
107 (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_l, "between: L");
108 (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_m, "between: M");
109 (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_r, "between: R");
113 RSET rsbetween_create( NMEM nmem, const struct key_control *kcontrol,
115 RSET rset_l, RSET rset_m, RSET rset_r, RSET rset_attr)
117 RSET rnew=rset_create_base(&control, nmem, kcontrol, scope,0);
118 struct rset_between_info *info=
119 (struct rset_between_info *) nmem_malloc(rnew->nmem,sizeof(*info));
120 info->rset_l = rset_l;
121 info->rset_m = rset_m;
122 info->rset_r = rset_r;
123 info->rset_attr = rset_attr;
129 static void r_delete_between (RSET ct)
131 struct rset_between_info *info = (struct rset_between_info *) ct->priv;
133 rset_delete (info->rset_l);
134 rset_delete (info->rset_m);
135 rset_delete (info->rset_r);
137 rset_delete (info->rset_attr);
141 static RSFD r_open_between (RSET ct, int flag)
143 struct rset_between_info *info = (struct rset_between_info *) ct->priv;
145 struct rset_between_rfd *p;
147 if (flag & RSETF_WRITE)
149 logf (LOG_FATAL, "between set type is read-only");
152 rfd=rfd_create_base(ct);
154 p=(struct rset_between_rfd *)rfd->priv;
156 p = (struct rset_between_rfd *) nmem_malloc(ct->nmem, (sizeof(*p)));
158 p->buf_l = nmem_malloc(ct->nmem, (ct->keycontrol->key_size));
159 p->buf_m = nmem_malloc(ct->nmem, (ct->keycontrol->key_size));
160 p->buf_r = nmem_malloc(ct->nmem, (ct->keycontrol->key_size));
161 p->buf_attr = nmem_malloc(ct->nmem, (ct->keycontrol->key_size));
164 p->rfd_l = rset_open (info->rset_l, RSETF_READ);
165 p->rfd_m = rset_open (info->rset_m, RSETF_READ);
166 p->rfd_r = rset_open (info->rset_r, RSETF_READ);
168 p->more_l = rset_read (p->rfd_l, p->buf_l,NULL);
169 p->more_m = rset_read (p->rfd_m, p->buf_m, &p->term_m);
170 p->more_r = rset_read (p->rfd_r, p->buf_r,NULL);
173 p->rfd_attr = rset_open (info->rset_attr, RSETF_READ);
174 p->more_attr = rset_read (p->rfd_attr, p->buf_attr, NULL);
183 static void r_close_between (RSFD rfd)
185 struct rset_between_info *info =(struct rset_between_info *)rfd->rset->priv;
186 struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
189 log2( rfd, "fwd: close. hits:", p->hits,0);
191 rset_close (p->rfd_l);
192 rset_close (p->rfd_m);
193 rset_close (p->rfd_r);
195 rset_close (p->rfd_attr);
196 rfd_delete_base(rfd);
201 static int r_forward_between(RSFD rfd, void *buf,
202 TERMID *term, const void *untilbuf)
204 struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
207 log2( rfd, "fwd: before forward", 0,0);
209 /* It is enough to forward the m pointer here, the read will */
210 /* naturally forward the l, m, and attr pointers */
212 p->more_m=rset_forward(p->rfd_m, p->buf_m, term, untilbuf);
214 log2( rfd, "fwd: after forward M", 0,0);
216 rc = r_read_between(rfd, buf, term);
218 log2( rfd, "fwd: after forward", 0,0);
226 static int r_read_between (RSFD rfd, void *buf, TERMID *term)
228 struct rset_between_info *info =(struct rset_between_info *)rfd->rset->priv;
229 struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
230 const struct key_control *kctrl=rfd->rset->keycontrol;
233 /* int attr_match = 0; */
238 log2( rfd, "start of loop", cmp_l, cmp_r);
239 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
240 p->level,p->more_l,p->more_m, p->more_r);
243 /* forward L until past m, count levels, note rec boundaries */
245 cmp_l= (*kctrl->cmp)(p->buf_l, p->buf_m);
249 cmp_l=rfd->rset->scope; /* past this record */
252 log2( rfd, "after first L", cmp_l, cmp_r);
253 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
254 p->level,p->more_l,p->more_m, p->more_r);
257 while (cmp_l < 0) /* l before m */
259 if (cmp_l <= - rfd->rset->scope) /* ==-2 */
260 p->level=0; /* earlier record */
261 if (cmp_l > - rfd->rset->scope) /* == -1 */
263 p->level++; /* relevant start tag */
265 if (!info->rset_attr)
273 cmp_attr = (*kctrl->cmp)(p->buf_attr, p->buf_l);
279 else if (cmp_attr > 0)
281 else if (cmp_attr > - rfd->rset->scope) /* == -1 */
282 p->more_attr = rset_read (p->rfd_attr,
284 /* if we had a forward that went all the way to
285 * the seqno, we could use that. But fwd only goes
287 else if (cmp_attr <= - rfd->rset->scope) /* ==-2 */
289 p->more_attr = rset_forward( p->rfd_attr,
290 p->buf_attr, NULL, p->buf_l);
292 logf(LOG_LOG, "btw: after frowarding attr m=%d",
296 } /* while more_attr */
301 if (cmp_l <= - rfd->rset->scope )/* ==-2 */
305 p->more_l=rset_forward(p->rfd_l, p->buf_l, NULL, p->buf_m);
307 cmp_l= (*kctrl->cmp)(p->buf_l, p->buf_m);
309 cmp_l=rfd->rset->scope; /*2*/
311 log2( rfd, "after forwarding L", cmp_l, cmp_r);
312 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
313 p->level,p->more_l,p->more_m, p->more_r);
318 p->more_l = rset_read (p->rfd_l, p->buf_l, NULL);
321 p->more_l = rset_read (p->rfd_l, p->buf_l, NULL);
325 cmp_l= (*kctrl->cmp)(p->buf_l, p->buf_m);
328 cmp_l=rfd->rset->scope; /*2*/
330 log2( rfd, "end of L loop", cmp_l, cmp_r);
331 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
332 p->level,p->more_l,p->more_m, p->more_r);
337 /* forward R until past m, count levels */
339 log2( rfd, "Before moving R", cmp_l, cmp_r);
340 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
341 p->level,p->more_l,p->more_m, p->more_r);
344 cmp_r= (*kctrl->cmp)(p->buf_r, p->buf_m);
346 cmp_r=rfd->rset->scope; /*2*/
348 log2( rfd, "after first R", cmp_l, cmp_r);
349 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
350 p->level,p->more_l,p->more_m, p->more_r);
352 while (cmp_r < 0) /* r before m */
354 /* -2, earlier record, don't count level */
355 if (cmp_r > -rfd->rset->scope) /* == -1 */
356 p->level--; /* relevant end tag */
360 if (cmp_r <= - rfd->rset->scope) /* == -2 */
362 p->more_r=rset_forward(p->rfd_r, p->buf_r, NULL, p->buf_m);
365 p->more_r = rset_read (p->rfd_r, p->buf_r, NULL);
368 cmp_r= (*kctrl->cmp)(p->buf_r, p->buf_m);
371 p->more_r = rset_read (p->rfd_r, p->buf_r, NULL);
372 cmp_r= (*kctrl->cmp)(p->buf_r, p->buf_m);
376 cmp_r=rfd->rset->scope; /*2*/
378 log2( rfd, "End of R loop", cmp_l, cmp_r);
379 logf(LOG_LOG,"level=%d. more=%d/%d/%d am=%d",
380 p->level,p->more_l,p->more_m, p->more_r, p->attr_match);
384 if ( ( p->level <= 0 ) && ! p->more_l)
387 logf(LOG_LOG,"no more_l, returning zero");
389 return 0; /* no more start tags, nothing more to find */
393 log2( rfd, "Considering M", cmp_l, cmp_r);
394 logf(LOG_LOG,"level=%d. more=%d/%d/%d am=%d",
395 p->level,p->more_l,p->more_m, p->more_r, p->attr_match);
397 if ( p->attr_match && p->level > 0) /* within a tag pair (or deeper) */
399 memcpy (buf, p->buf_m, kctrl->key_size);
403 log2( rfd, "Returning a hit (and forwarding m)", cmp_l, cmp_r);
405 p->more_m = rset_read (p->rfd_m, p->buf_m, NULL);
407 logf(LOG_LOG,"read m. more=%d level=%d "
408 "cmp_l=%d scope=%d hits="ZINT_FORMAT,
410 cmp_l, rfd->rset->scope, p->hits);
412 if (cmp_l >= rfd->rset->scope) /* == 2 */
417 else if ( ! p->more_l ) /* not in data, no more starts */
420 log2( rfd, "no more starts, exiting without a hit", cmp_l, cmp_r);
422 return 0; /* ergo, nothing can be found. stop scanning */
425 if (cmp_l >= rfd->rset->scope) /* == 2 */
428 p->more_m=rset_forward(p->rfd_m, p->buf_m, &p->term_m, p->buf_l);
431 p->more_m = rset_read (p->rfd_m, p->buf_m, &p->term_m);
434 if (cmp_l >= rfd->rset->scope ) /* == 2 */
436 p->more_m = rset_read (p->rfd_m, p->buf_m, &p->term_m);
439 log2( rfd, "End of M loop", cmp_l, cmp_r);
440 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
441 p->level,p->more_l,p->more_m, p->more_r);
446 log2( rfd, "Exiting, nothing more in m", cmp_l, cmp_r);
447 logf(LOG_LOG,"level=%d. more=%d/%d/%d",
448 p->level,p->more_l,p->more_m, p->more_r);
450 return 0; /* no more data possible */
456 static int r_write_between (RSFD rfd, const void *buf)
458 logf (LOG_FATAL, "between set type is read-only");
463 static void r_pos_between (RSFD rfd, double *current, double *total)
465 struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
471 rset_pos(p->rfd_l, &lcur, <ot);
472 rset_pos(p->rfd_m, &mcur, &mtot);
473 rset_pos(p->rfd_r, &rcur, &rtot);
474 if ( (ltot<0) && (mtot<0) && (rtot<0) ) { /*no position */
475 *current=mcur; /* return same as you got */
476 *total=mtot; /* probably -1 for not available */
478 if ( ltot<0) { ltot=0; lcur=0;} /* if only one useful, use it */
479 if ( mtot<0) { mtot=0; mcur=0;}
480 if ( rtot<0) { rtot=0; rcur=0;}
481 if ( ltot+mtot+rtot < 1 ) { /* empty rset */
486 r=1.0*(lcur+mcur+rcur)/(ltot+mtot+rtot); /* weighed average of l and r */
491 struct rset_between_info *info =(struct rset_between_info *)rfd->rset->priv;
492 yaz_log(LOG_LOG,"betw_pos: (%s/%s) %0.1f/%0.1f= %0.4f ",
493 info->rset_l->control->desc, info->rset_r->control->desc,
494 *current, *total, r);
499 static void r_get_terms(RSET ct, TERMID *terms, int maxterms, int *curterm)
501 struct rset_between_info *info = (struct rset_between_info *) ct->priv;
502 rset_getterms(info->rset_m, terms, maxterms, curterm);
506 * One of the major problems with rsbetween is the complexity of keeping track
507 * of start tags, stop tags, hits, and attributes, together with record
510 * Things can be divided into finding the right records, and then processing
511 * hits inside the record.
513 * Finding the record is mostly a matter of forwarding until we have a start
514 * tag and a hit in the same record.
516 * Handling stuff inside a record, we can simplify things by implementing a
517 * glorified OR operator that returns all the occurrences in proper order,
518 * together with info on what type it was. Then the main logic can just keep
519 * reading, and consider each type separately:
520 * - if a start tag, increment level (some trickery with attributes!)
521 * - if a stop tag, decrement level
522 * - if a hit, and we have a level, return it
523 * - if a hit, but no level, ignore it
525 * The attributes can be detected when ever reading start tags. The main
526 * routine needs to keep a stack of attribute match bits, so when ever we read
527 * a start tag, we must report back if we have a matching attribute or not.