-/* $Id: rsbetween.c,v 1.28 2004-10-22 10:12:51 heikki Exp $
+/* $Id: rsbetween.c,v 1.29 2004-11-01 15:53:57 heikki Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
Index Data Aps
/* rsbetween is (mostly) used for xml searches. It returns the hits of the
* "middle" rset, that are in between the "left" and "right" rsets. For
- * example "Shakespeare" in between "<title>" and </title>. The thing is
+ * example "Shakespeare" in between "<author>" and </author>. The thing is
* complicated by the inclusion of attributes (from their own rset). If attrs
* specified, they must match the "left" rset (start tag). "Hamlet" between
* "<title lang=eng>" and "</title>". (This assumes that the attributes are
* indexed to the same seqno as the tags).
+ *
+ * Currently fails to return all hits from a record, which breaks the ranking.
+ * This is bug #202. Seems like there is no decent way to get that working
+ * with the current implementation. I have planned a new method for doing
+ * this, see at the end of this file.
+ *
*/
#include <stdio.h>
void *buf_attr;
TERMID term_m; /* we only return terms for the mid argument */
int level; /* counting start/end tags */
+ int attr_match; /* did we have a matching attr for L */
zint hits;
};
#if RSBETWEEN_DEBUG
-static void log2 (struct rset_between_rfd *p, char *msg, int cmp_l, int cmp_r)
+static void log2 (RSFD rfd, char *msg, int cmp_l, int cmp_r)
{
- char buf_l[32];
- char buf_m[32];
- char buf_r[32];
- logf(LOG_DEBUG,"btw: %s l=%s(%d/%d) m=%s(%d) r=%s(%d/%d), lev=%d",
- msg,
- (*p->info->printer)(p->buf_l, buf_l), p->more_l, cmp_l,
- (*p->info->printer)(p->buf_m, buf_m), p->more_m,
- (*p->info->printer)(p->buf_r, buf_r), p->more_r, cmp_r,
- p->level);
+ struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
+ RSET ct = rfd->rset;
+ logf(LOG_LOG,"between: %s cmp_l=%d cmp_r=%d", msg, cmp_l, cmp_r);
+ (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_l, "between: L");
+ (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_m, "between: M");
+ (*ct->keycontrol->key_logdump_txt)(LOG_LOG, p->buf_r, "between: R");
}
#endif
{
p->rfd_attr = rset_open (info->rset_attr, RSETF_READ);
p->more_attr = rset_read (p->rfd_attr, p->buf_attr, NULL);
- }
+ p->attr_match=0;
+ } else
+ p->attr_match=1;
p->level=0;
p->hits=0;
return rfd;
struct rset_between_info *info =(struct rset_between_info *)rfd->rset->priv;
struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
+#if RSBETWEEN_DEBUG
+ log2( rfd, "fwd: close. hits:", p->hits,0);
+#endif
rset_close (p->rfd_l);
rset_close (p->rfd_m);
rset_close (p->rfd_r);
struct rset_between_rfd *p=(struct rset_between_rfd *)rfd->priv;
int rc;
#if RSBETWEEN_DEBUG
- log2( p, "fwd: before forward", 0,0);
+ log2( rfd, "fwd: before forward", 0,0);
#endif
/* It is enough to forward the m pointer here, the read will */
/* naturally forward the l, m, and attr pointers */
if (p->more_m)
p->more_m=rset_forward(p->rfd_m, p->buf_m, term, untilbuf);
#if RSBETWEEN_DEBUG
- log2( p, "fwd: after forward M", 0,0);
+ log2( rfd, "fwd: after forward M", 0,0);
#endif
rc = r_read_between(rfd, buf, term);
#if RSBETWEEN_DEBUG
- log2( p, "fwd: after forward", 0,0);
+ log2( rfd, "fwd: after forward", 0,0);
#endif
return rc;
}
const struct key_control *kctrl=rfd->rset->keycontrol;
int cmp_l=0;
int cmp_r=0;
- int attr_match = 0;
+/* int attr_match = 0; */
while (p->more_m)
{
#if RSBETWEEN_DEBUG
- log2( p, "start of loop", cmp_l, cmp_r);
+ log2( rfd, "start of loop", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
/* forward L until past m, count levels, note rec boundaries */
cmp_l=rfd->rset->scope; /* past this record */
}
#if RSBETWEEN_DEBUG
- log2( p, "after first L", cmp_l, cmp_r);
+ log2( rfd, "after first L", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
while (cmp_l < 0) /* l before m */
p->level++; /* relevant start tag */
if (!info->rset_attr)
- attr_match = 1;
+ p->attr_match = 1;
else
{
int cmp_attr;
- attr_match = 0;
+ p->attr_match = 0;
while (p->more_attr)
{
cmp_attr = (*kctrl->cmp)(p->buf_attr, p->buf_l);
if (cmp_attr == 0)
{
- attr_match = 1;
+ p->attr_match = 1;
break;
}
else if (cmp_attr > 0)
p->more_attr = rset_forward( p->rfd_attr,
p->buf_attr, NULL, p->buf_l);
#if RSBETWEEN_DEBUG
- logf(LOG_DEBUG, "btw: after frowarding attr m=%d",
+ logf(LOG_LOG, "btw: after frowarding attr m=%d",
p->more_attr);
#endif
}
else
cmp_l=rfd->rset->scope; /*2*/
#if RSBETWEEN_DEBUG
- log2( p, "after forwarding L", cmp_l, cmp_r);
+ log2( rfd, "after forwarding L", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
}
} else
else
cmp_l=rfd->rset->scope; /*2*/
#if RSBETWEEN_DEBUG
- log2( p, "end of L loop", cmp_l, cmp_r);
+ log2( rfd, "end of L loop", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
} /* forward L */
/* forward R until past m, count levels */
#if RSBETWEEN_DEBUG
- log2( p, "Before moving R", cmp_l, cmp_r);
+ log2( rfd, "Before moving R", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
if (p->more_r)
cmp_r= (*kctrl->cmp)(p->buf_r, p->buf_m);
else
cmp_r=rfd->rset->scope; /*2*/
#if RSBETWEEN_DEBUG
- log2( p, "after first R", cmp_l, cmp_r);
+ log2( rfd, "after first R", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
while (cmp_r < 0) /* r before m */
{
else
cmp_r=rfd->rset->scope; /*2*/
#if RSBETWEEN_DEBUG
- log2( p, "End of R loop", cmp_l, cmp_r);
+ log2( rfd, "End of R loop", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d am=%d",
+ p->level,p->more_l,p->more_m, p->more_r, p->attr_match);
#endif
} /* forward R */
if ( ( p->level <= 0 ) && ! p->more_l)
+ {
+#if RSBETWEEN_DEBUG
+ logf(LOG_LOG,"no more_l, returning zero");
+#endif
return 0; /* no more start tags, nothing more to find */
+ }
- if ( attr_match && p->level > 0) /* within a tag pair (or deeper) */
+#if RSBETWEEN_DEBUG
+ log2( rfd, "Considering M", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d am=%d",
+ p->level,p->more_l,p->more_m, p->more_r, p->attr_match);
+#endif
+ if ( p->attr_match && p->level > 0) /* within a tag pair (or deeper) */
{
memcpy (buf, p->buf_m, kctrl->key_size);
if (term)
*term=p->term_m;
#if RSBETWEEN_DEBUG
- log2( p, "Returning a hit (and forwarding m)", cmp_l, cmp_r);
+ log2( rfd, "Returning a hit (and forwarding m)", cmp_l, cmp_r);
#endif
p->more_m = rset_read (p->rfd_m, p->buf_m, NULL);
+#if RSBETWEEN_DEBUG
+ logf(LOG_LOG,"read m. more=%d level=%d "
+ "cmp_l=%d scope=%d hits="ZINT_FORMAT,
+ p->more_m, p->level,
+ cmp_l, rfd->rset->scope, p->hits);
+#endif
if (cmp_l >= rfd->rset->scope) /* == 2 */
p->level = 0;
p->hits++;
else if ( ! p->more_l ) /* not in data, no more starts */
{
#if RSBETWEEN_DEBUG
- log2( p, "no more starts, exiting without a hit", cmp_l, cmp_r);
+ log2( rfd, "no more starts, exiting without a hit", cmp_l, cmp_r);
#endif
return 0; /* ergo, nothing can be found. stop scanning */
}
#else
if (cmp_l >= rfd->rset->scope ) /* == 2 */
p->level = 0;
- p->more_m = rset_read (p->rfd_m, p->buf_m);
+ p->more_m = rset_read (p->rfd_m, p->buf_m, &p->term_m);
#endif
#if RSBETWEEN_DEBUG
- log2( p, "End of M loop", cmp_l, cmp_r);
+ log2( rfd, "End of M loop", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
} /* while more_m */
#if RSBETWEEN_DEBUG
- log2( p, "Exiting, nothing more in m", cmp_l, cmp_r);
+ log2( rfd, "Exiting, nothing more in m", cmp_l, cmp_r);
+ logf(LOG_LOG,"level=%d. more=%d/%d/%d",
+ p->level,p->more_l,p->more_m, p->more_r);
#endif
return 0; /* no more data possible */
*current=p->hits;
*total=*current/r ;
#if RSBETWEEN_DEBUG
- yaz_log(LOG_DEBUG,"betw_pos: (%s/%s) %0.1f/%0.1f= %0.4f ",
+ {
+ struct rset_between_info *info =(struct rset_between_info *)rfd->rset->priv;
+ yaz_log(LOG_LOG,"betw_pos: (%s/%s) %0.1f/%0.1f= %0.4f ",
info->rset_l->control->desc, info->rset_r->control->desc,
*current, *total, r);
+ }
#endif
}
rset_getterms(info->rset_m, terms, maxterms, curterm);
}
+/* Better algorithm
+ * One of the major problems with rsbetween is the complexity of keeping track
+ * of start tags, stop tags, hits, and attributes, together with record
+ * boundaries etc.
+ *
+ * Things can be divided into finding the right records, and then processing
+ * hits inside the record.
+ *
+ * Finding the record is mostly a matter of forwarding until we have a start
+ * tag and a hit in the same record.
+ *
+ * Handling stuff inside a record, we can simplify things by implementing a
+ * glorified OR operator that returns all the occurrences in proper order,
+ * together with info on what type it was. Then the main logic can just keep
+ * reading, and consider each type separately:
+ * - if a start tag, increment level (some trickery with attributes!)
+ * - if a stop tag, decrement level
+ * - if a hit, and we have a level, return it
+ * - if a hit, but no level, ignore it
+ *
+ * The attributes can be detected when ever reading start tags. The main
+ * routine needs to keep a stack of attribute match bits, so when ever we read
+ * a start tag, we must report back if we have a matching attribute or not.
+ *
+ */
+