* Sebastian Hammer, Adam Dickmeiss
*
* $Log: zrpn.c,v $
- * Revision 1.55 1996-11-04 14:07:44 adam
+ * Revision 1.61 1997-02-10 10:21:14 adam
+ * Bug fix: in search terms character (^) wasn't observed.
+ *
+ * Revision 1.60 1997/01/31 11:10:34 adam
+ * Bug fix: Leading and trailing white space weren't removed in scan tokens.
+ *
+ * Revision 1.59 1997/01/17 11:31:46 adam
+ * Bug fix: complete phrase search didn't work.
+ *
+ * Revision 1.58 1996/12/23 15:30:45 adam
+ * Work on truncation.
+ * Bug fix: result sets weren't deleted after server shut down.
+ *
+ * Revision 1.57 1996/11/11 13:38:02 adam
+ * Added proximity support in search.
+ *
+ * Revision 1.56 1996/11/08 11:10:32 adam
+ * Buffers used during file match got bigger.
+ * Compressed ISAM support everywhere.
+ * Bug fixes regarding masking characters in queries.
+ * Redesigned Regexp-2 queries.
+ *
+ * Revision 1.55 1996/11/04 14:07:44 adam
* Moved truncation code to trunc.c.
*
* Revision 1.54 1996/10/29 14:09:52 adam
return 0;
}
+/* term_pre - advance *src past leading white space.
+ * src: in/out pointer to the current position in the term.
+ * ct1, ct2: optional (may be NULL) sets of characters; the scan stops
+ *   as soon as the current character is found in either of them.
+ * returns the character at the new position (0 at end of string).
+ */
+static int term_pre (char **src, const char *ct1, const char *ct2)
+{
+    char *cur;
+
+    for (cur = *src; *cur; )
+    {
+        char *next = cur;
+        char **map;
+
+        if ((ct1 && strchr (ct1, *cur)) || (ct2 && strchr (ct2, *cur)))
+            break;
+        /* map_chrs_input is handed &next; presumably it advances next
+           beyond the input it consumed - verify against its definition */
+        map = map_chrs_input (&next, strlen(next));
+        if (**map != *CHR_SPACE)
+            break;
+        cur = next;
+    }
+    *src = cur;
+    return *cur;
+}
+
+/* term_100 - fetch next term verbatim (no truncation characters).
+ * src: in/out position in the query term; leading white space is
+ *   skipped and on return *src points past the consumed input.
+ * dst: receives the term with a backslash in front of every
+ *   non-alphanumeric character, NUL-terminated. No bound is checked,
+ *   so the caller must provide room for two bytes per source byte
+ *   plus the terminator.
+ * space_split: if non-zero, stop at the first character that maps
+ *   to space.
+ * returns number of bytes stored in dst; 0 is returned without
+ *   touching dst when nothing but white space is left.
+ */
+static int term_100 (char **src, char *dst, int space_split)
+{
+    char *s0, *s1, **map;
+    int i = 0;
+
+    if (!term_pre (src, NULL, NULL))
+        return 0;
+    s0 = *src;
+    while (*s0)
+    {
+        s1 = s0;
+        map = map_chrs_input (&s0, strlen(s0));
+        if (space_split && **map == *CHR_SPACE)
+            break;
+        /* copy the raw input bytes that were just consumed */
+        while (s1 < s0)
+        {
+            /* the ctype functions are only defined for unsigned char
+               values (and EOF); plain char may be negative */
+            if (!isalnum ((unsigned char) *s1))
+                dst[i++] = '\\';
+            dst[i++] = *s1++;
+        }
+    }
+    dst[i] = '\0';
+    *src = s0;
+    return i;
+}
+
+/* term_101 - fetch next term, treating # as a truncation character.
+ * src: in/out position in the query term; leading white space is
+ *   skipped (a leading # ends the skipping) and on return *src points
+ *   past the consumed input.
+ * dst: receives the regular expression: each # becomes ".*", every
+ *   other non-alphanumeric character gets a backslash in front. The
+ *   result is NUL-terminated. No bound is checked, so the caller must
+ *   provide room for two bytes per source byte plus the terminator.
+ * space_split: if non-zero, stop at the first character that maps
+ *   to space.
+ * returns number of bytes stored in dst; 0 is returned without
+ *   touching dst when nothing but white space is left.
+ */
+static int term_101 (char **src, char *dst, int space_split)
+{
+    char *s0, *s1, **map;
+    int i = 0;
+
+    if (!term_pre (src, "#", "#"))
+        return 0;
+    s0 = *src;
+    while (*s0)
+    {
+        if (*s0 == '#')
+        {
+            dst[i++] = '.';
+            dst[i++] = '*';
+            s0++;
+        }
+        else
+        {
+            s1 = s0;
+            map = map_chrs_input (&s0, strlen(s0));
+            if (space_split && **map == *CHR_SPACE)
+                break;
+            /* copy the raw input bytes that were just consumed */
+            while (s1 < s0)
+            {
+                /* the ctype functions are only defined for unsigned
+                   char values (and EOF); plain char may be negative */
+                if (!isalnum ((unsigned char) *s1))
+                    dst[i++] = '\\';
+                dst[i++] = *s1++;
+            }
+        }
+    }
+    dst[i] = '\0';
+    *src = s0;
+    return i;
+}
+
+
+
+/* term_103 - fetch next term as a regular expression.
+ * src: in/out position in the query term; leading white space is
+ *   skipped and on return *src points past the consumed input.
+ * dst: receives the expression, NUL-terminated. The operator
+ *   characters ^\()[].*+?|- are copied as they are; every other
+ *   non-alphanumeric character gets a backslash in front. No bound is
+ *   checked, so the caller must provide room for two bytes per source
+ *   byte plus the terminator.
+ * errors: may be NULL. Otherwise a leading "+N+" (N a single digit,
+ *   followed by at least one more character) is consumed and N,
+ *   limited to 3, is stored in *errors.
+ * space_split: not consulted here - a character that maps to space
+ *   always ends the term.
+ * returns number of bytes stored in dst; 0 is returned without
+ *   touching dst when nothing but white space is left.
+ */
+static int term_103 (char **src, char *dst, int *errors, int space_split)
+{
+    int i = 0;
+    char *s0, *s1, **map;
+
+    if (!term_pre (src, "^\\()[].*+?|", "("))
+        return 0;
+    s0 = *src;
+    /* the ctype functions are only defined for unsigned char values
+       (and EOF); plain char may be negative, hence the casts below */
+    if (errors && *s0 == '+' && s0[1] && s0[2] == '+' && s0[3] &&
+        isdigit ((unsigned char) s0[1]))
+    {
+        *errors = s0[1] - '0';
+        s0 += 3;
+        if (*errors > 3)
+            *errors = 3;
+    }
+    while (*s0)
+    {
+        if (strchr ("^\\()[].*+?|-", *s0))
+            dst[i++] = *s0++;
+        else
+        {
+            s1 = s0;
+            map = map_chrs_input (&s0, strlen(s0));
+            if (**map == *CHR_SPACE)
+                break;
+            /* copy the raw input bytes that were just consumed */
+            while (s1 < s0)
+            {
+                if (!isalnum ((unsigned char) *s1))
+                    dst[i++] = '\\';
+                dst[i++] = *s1++;
+            }
+        }
+    }
+    dst[i] = '\0';
+    *src = s0;
+    return i;
+}
+
+/* term_102 - fetch next term as a regular expression; same as
+ * term_103 called without an error-count destination.
+ */
+static int term_102 (char **src, char *dst, int space_split)
+{
+    int *no_errors = NULL;
+
+    return term_103 (src, dst, no_errors, space_split);
+}
+
/* gen_regular_rel - generate regular expression from relation
* val: border value (inclusive)
* islt: 1 if <=; 0 if >=.
}
static int relational_term (ZServerInfo *zi, Z_AttributesPlusTerm *zapt,
- const char *term_sub,
+ char **term_sub,
char *term_dict,
oid_value attributeSet,
struct grep_info *grep_info,
attr_init (&relation, zapt, 2);
relation_value = attr_find (&relation, NULL);
- term_value = atoi (term_sub);
switch (relation_value)
{
case 1:
+ if (!term_100 (term_sub, term_dict, 1))
+ return 0;
+ term_value = atoi (term_dict);
if (term_value <= 0)
return 1;
logf (LOG_DEBUG, "Relation <");
gen_regular_rel (term_dict + strlen(term_dict), term_value-1, 1);
break;
case 2:
+ if (!term_100 (term_sub, term_dict, 1))
+ return 0;
+ term_value = atoi (term_dict);
if (term_value < 0)
return 1;
logf (LOG_DEBUG, "Relation <=");
gen_regular_rel (term_dict + strlen(term_dict), term_value, 1);
break;
case 4:
+ if (!term_100 (term_sub, term_dict, 1))
+ return 0;
+ term_value = atoi (term_dict);
if (term_value < 0)
term_value = 0;
logf (LOG_DEBUG, "Relation >=");
gen_regular_rel (term_dict + strlen(term_dict), term_value, 0);
break;
case 5:
+ if (!term_100 (term_sub, term_dict, 1))
+ return 0;
+ term_value = atoi (term_dict);
if (term_value < 0)
term_value = 0;
logf (LOG_DEBUG, "Relation >");
return 1;
}
-static void verbatim_char (int ch, int *indx, char *dst)
-{
- if (!isalnum (ch))
- dst[(*indx)++] = '\\';
- dst[(*indx)++] = ch;
-}
-
static int field_term (ZServerInfo *zi, Z_AttributesPlusTerm *zapt,
- const char *term_sub, int regType,
+ char **term_sub, int regType,
oid_value attributeSet, struct grep_info *grep_info,
- int num_bases, char **basenames)
+ int num_bases, char **basenames, int space_split)
{
char term_dict[2*IT_MAX_WORD+2];
- int i, j, r, base_no;
+ int j, r, base_no;
AttrType truncation;
int truncation_value;
AttrType use;
int use_value;
oid_value curAttributeSet = attributeSet;
+ char *termp;
attr_init (&use, zapt, 1);
use_value = attr_find (&use, &curAttributeSet);
- logf (LOG_DEBUG, "use value %d", use_value);
+ logf (LOG_DEBUG, "field_term, use value %d", use_value);
attr_init (&truncation, zapt, 5);
truncation_value = attr_find (&truncation, NULL);
logf (LOG_DEBUG, "truncation value %d", truncation_value);
data1_local_attribute *local_attr;
int max_pos, prefix_len = 0;
+ termp = *term_sub;
attp = att_getentbyatt (curAttributeSet, use_value);
if (!attp)
{
term_dict[prefix_len++] = 1;
term_dict[prefix_len++] = regType;
term_dict[prefix_len] = '\0';
- if (!relational_term (zi, zapt, term_sub, term_dict,
+ if (!relational_term (zi, zapt, &termp, term_dict,
attributeSet, grep_info, &max_pos))
{
- const char *cp;
-
j = prefix_len;
switch (truncation_value)
{
case -1: /* not specified */
case 100: /* do not truncate */
- term_dict[j++] = '(';
- for (i = 0; term_sub[i]; i++)
- verbatim_char (term_sub[i], &j, term_dict);
- strcpy (term_dict+j, ")");
+ term_dict[j++] = '(';
+ if (!term_100 (&termp, term_dict + j, space_split))
+ return 0;
+ strcat (term_dict, ")");
r = dict_lookup_grep (zi->dict, term_dict, 0, grep_info,
&max_pos, 0, grep_handle);
if (r)
break;
case 1: /* right truncation */
term_dict[j++] = '(';
- for (i = 0; term_sub[i]; i++)
- verbatim_char (term_sub[i], &j, term_dict);
- strcpy (term_dict+j, ".*)");
+ if (!term_100 (&termp, term_dict + j, space_split))
+ return 0;
+ strcat (term_dict, ".*)");
dict_lookup_grep (zi->dict, term_dict, 0, grep_info,
&max_pos, 0, grep_handle);
break;
return -1;
case 101: /* process # in term */
term_dict[j++] = '(';
- for (i=0; term_sub[i]; i++)
- if (term_sub[i] == '#' && i > 2)
- {
- term_dict[j++] = '.';
- term_dict[j++] = '*';
- }
- else
- verbatim_char (term_sub[i], &j, term_dict);
- strcpy (term_dict+j, ")");
+ if (!term_101 (&termp, term_dict + j, space_split))
+ return 0;
+ strcat (term_dict, ")");
r = dict_lookup_grep (zi->dict, term_dict, 0, grep_info,
&max_pos, 0, grep_handle);
if (r)
- logf (LOG_WARN, "dict_lookup_grep err, trunc=#: %d",
- r);
+ logf (LOG_WARN, "dict_lookup_grep err, trunc=#: %d", r);
break;
- case 102: /* regular expression */
- sprintf (term_dict + j, "(%s)", term_sub);
+ case 102: /* Regexp-1 */
+ term_dict[j++] = '(';
+ if (!term_102 (&termp, term_dict + j, space_split))
+ return 0;
+ strcat (term_dict, ")");
+ logf (LOG_DEBUG, "Regexp-1 tolerance=%d", r);
r = dict_lookup_grep (zi->dict, term_dict, 0, grep_info,
&max_pos, 0, grep_handle);
if (r)
logf (LOG_WARN, "dict_lookup_grep err, trunc=regular: %d",
r);
break;
- case 103: /* regular expression with error correction */
- cp = term_sub;
- r = 0;
- if (*cp == '*' && cp[1] && cp[2])
- {
- r = atoi (cp+1);
- cp += 2;
- }
- sprintf (term_dict + j, "(%s)", cp);
+ case 103: /* Regexp-2 */
+ r = 1;
+ term_dict[j++] = '(';
+ if (!term_103 (&termp, term_dict + j, &r, space_split))
+ return 0;
+ strcat (term_dict, ")");
+ logf (LOG_DEBUG, "Regexp-2 tolerance=%d", r);
r = dict_lookup_grep (zi->dict, term_dict, r, grep_info,
- &max_pos, j, grep_handle);
+ &max_pos, 2, grep_handle);
if (r)
logf (LOG_WARN, "dict_lookup_grep err, trunc=eregular: %d",
r);
}
}
}
+ *term_sub = termp;
logf (LOG_DEBUG, "%d positions", grep_info->isam_p_indx);
- return 0;
+ return 1;
}
static void trans_term (ZServerInfo *zi, Z_AttributesPlusTerm *zapt,
const char *cp_end = cp + term->u.general->len;
const char *src;
int i = 0;
- int prev_space = 0;
+ const char *space_map = NULL;
int len;
while ((len = (cp_end - cp)) > 0)
{
map = map_chrs_input (&cp, len);
if (**map == *CHR_SPACE)
- {
- if (prev_space)
- continue;
- prev_space = 1;
- }
+ space_map = *map;
else
- prev_space = 0;
- for (src = *map; *src; src++)
- termz[i++] = *src;
+ {
+ if (i && space_map)
+ for (src = space_map; *src; src++)
+ termz[i++] = *src;
+ space_map = NULL;
+ for (src = *map; *src; src++)
+ termz[i++] = *src;
+ }
}
termz[i] = '\0';
}
{
rset_relevance_parms parms;
char termz[IT_MAX_WORD+1];
- char term_sub[IT_MAX_WORD+1];
+ char *termp = termz;
struct grep_info grep_info;
- char *p0 = termz;
RSET result;
int term_index = 0;
+ int r;
parms.key_size = sizeof(struct it_key);
- parms.max_rec = 100;
- parms.cmp = key_compare;
+ parms.max_rec = 1000;
+ parms.cmp = key_compare_it;
parms.is = zi->isam;
+ parms.isc = zi->isamc;
parms.no_terms = 0;
if (zapt->term->which != Z_Term_general)
grep_info.isam_p_buf = NULL;
while (1)
{
- char **map;
- char *p2, *p1;
-
- p1 = p0;
- while (*(p0 = p1))
- {
- map = map_chrs_input (&p1, strlen(p1));
- if (**map != *CHR_SPACE)
- break;
- }
- if (!*p0)
- break;
-
- p1 = p0;
- while (*(p2 = p1))
- {
- map = map_chrs_input (&p1, strlen(p1));
- if (**map == *CHR_SPACE)
- break;
- }
- if (p2 == p0)
+ r = field_term (zi, zapt, &termp, 'w', attributeSet, &grep_info,
+ num_bases, basenames, 1);
+ if (r <= 0)
break;
- memcpy (term_sub, p0, p2-p0);
- term_sub[p2-p0] = '\0';
- p0 = p2;
- if (field_term (zi, zapt, term_sub, 'w', attributeSet, &grep_info,
- num_bases, basenames))
- return NULL;
#ifdef TERM_COUNT
for (; term_index < grep_info.isam_p_indx; term_index++)
grep_info.term_no[term_index] = parms.no_terms;
char termz[IT_MAX_WORD+1];
struct grep_info grep_info;
RSET result;
+ char *termp = termz;
+ int r;
if (zapt->term->which != Z_Term_general)
{
grep_info.isam_p_size = 0;
grep_info.isam_p_buf = NULL;
- if (field_term (zi, zapt, termz, 'p', attributeSet, &grep_info,
- num_bases, basenames))
- return NULL;
+ r = field_term (zi, zapt, &termp, 'p', attributeSet, &grep_info,
+ num_bases, basenames, 0);
result = rset_trunc (zi, grep_info.isam_p_buf, grep_info.isam_p_indx);
#ifdef TERM_COUNT
xfree(grep_info.term_no);
return result;
}
+/* rpn_proximity - combine two result sets with a proximity test.
+ * rset1, rset2: operand sets; both are opened for reading and closed
+ *   again before returning (they are not deleted here).
+ * ordered: if zero, the sign of the sequence number difference is
+ *   ignored.
+ * exclusion: if non-zero, the outcome of the relation test is inverted.
+ * relation: 1 <, 2 <=, 3 ==, 4 >=, 5 >, 6 !=; any other value never
+ *   satisfies the test.
+ * distance: value the sequence number difference is compared with.
+ * returns a new temporary result set; it receives the current rset2
+ *   key once for every remembered rset1 position that is accepted.
+ */
+static RSET rpn_proximity (RSET rset1, RSET rset2, int ordered,
+                           int exclusion, int relation, int distance)
+{
+    enum { max_seqno = 500 };   /* rset1 positions remembered per record */
+    RSFD fd1, fd2, fd_out;
+    RSET result;
+    struct it_key key1, key2;
+    int have1, have2;
+    rset_temp_parms parms;
+
+    fd1 = rset_open (rset1, RSETF_READ|RSETF_SORT_SYSNO);
+    have1 = rset_read (rset1, fd1, &key1);
+
+    fd2 = rset_open (rset2, RSETF_READ|RSETF_SORT_SYSNO);
+    have2 = rset_read (rset2, fd2, &key2);
+
+    parms.key_size = sizeof (struct it_key);
+    result = rset_create (rset_kind_temp, &parms);
+    fd_out = rset_open (result, RSETF_WRITE|RSETF_SORT_SYSNO);
+
+    logf (LOG_DEBUG, "rpn_proximity excl=%d ord=%d rel=%d dis=%d",
+          exclusion, ordered, relation, distance);
+    while (have1 && have2)
+    {
+        int sysno, n, k;
+        int seqno[max_seqno];
+        int cmp = key_compare_it (&key1, &key2);
+
+        /* NOTE(review): a magnitude above 1 presumably means that the
+           keys belong to different records - confirm in key_compare_it */
+        if (cmp < -1)
+        {
+            have1 = rset_read (rset1, fd1, &key1);
+            continue;
+        }
+        if (cmp > 1)
+        {
+            have2 = rset_read (rset2, fd2, &key2);
+            continue;
+        }
+
+        /* remember the rset1 positions of this record; positions
+           beyond the capacity of seqno are read but dropped */
+        sysno = key1.sysno;
+        n = 0;
+        seqno[n++] = key1.seqno;
+        for (;;)
+        {
+            have1 = rset_read (rset1, fd1, &key1);
+            if (!have1 || key1.sysno != sysno)
+                break;
+            if (n < max_seqno)
+                seqno[n++] = key1.seqno;
+        }
+
+        /* test every rset2 key of the record against those positions */
+        for (;;)
+        {
+            for (k = 0; k < n; k++)
+            {
+                int hit = 0;
+                int diff = key2.seqno - seqno[k];
+
+                if (diff < 0 && !ordered)
+                    diff = -diff;
+                switch (relation)
+                {
+                case 1:      /* < */
+                    hit = diff < distance;
+                    break;
+                case 2:      /* <= */
+                    hit = diff <= distance;
+                    break;
+                case 3:      /* == */
+                    hit = diff == distance;
+                    break;
+                case 4:      /* >= */
+                    hit = diff >= distance;
+                    break;
+                case 5:      /* > */
+                    hit = diff > distance;
+                    break;
+                case 6:      /* != */
+                    hit = diff != distance;
+                    break;
+                }
+                if (hit ? !exclusion : exclusion)
+                    rset_write (result, fd_out, &key2);
+            }
+            have2 = rset_read (rset2, fd2, &key2);
+            if (!have2 || key2.sysno != sysno)
+                break;
+        }
+    }
+    rset_close (result, fd_out);
+    rset_close (rset1, fd1);
+    rset_close (rset2, fd2);
+    return result;
+}
+
+
static RSET rpn_prox (RSET *rset, int rset_no)
{
int i;
*more = 0;
break;
}
- cmp = key_compare (buf[i], buf[i-1]);
+ cmp = key_compare_it (buf[i], buf[i-1]);
if (cmp > 1)
{
more[i-1] = rset_read (rset[i-1], rsfd[i-1], buf[i-1]);
int num_bases, char **basenames)
{
char termz[IT_MAX_WORD+1];
- char term_sub[IT_MAX_WORD+1];
- char *p0 = termz;
+ char *termp = termz;
RSET rset[60], result;
- int i, rset_no = 0;
+ int i, r, rset_no = 0;
struct grep_info grep_info;
if (zapt->term->which != Z_Term_general)
while (1)
{
- char **map;
- char *p2, *p1;
-
- p1 = p0;
- while (*(p0 = p1))
- {
- map = map_chrs_input (&p1, strlen(p1));
- if (**map != *CHR_SPACE)
- break;
- }
- if (!*p0)
- break;
-
- p1 = p0;
- while (*(p2 = p1))
- {
- map = map_chrs_input (&p1, strlen(p1));
- if (**map == *CHR_SPACE)
- break;
- }
- if (p2 == p0)
- break;
-
- memcpy (term_sub, p0, p2-p0);
- term_sub[p2-p0] = '\0';
- p0 = p2;
-
grep_info.isam_p_indx = 0;
- if (field_term (zi, zapt, term_sub, 'w', attributeSet, &grep_info,
- num_bases, basenames))
- return NULL;
+ r = field_term (zi, zapt, &termp, 'w', attributeSet, &grep_info,
+ num_bases, basenames, 1);
+ if (r < 1)
+ break;
rset[rset_no] = rset_trunc (zi, grep_info.isam_p_buf,
grep_info.isam_p_indx);
assert (rset[rset_no]);
return rset_create (rset_kind_null, NULL);
else if (rset_no == 1)
return (rset[0]);
-
result = rpn_prox (rset, rset_no);
for (i = 0; i<rset_no; i++)
rset_delete (rset[i]);
RSET r = NULL;
if (zs->which == Z_RPNStructure_complex)
{
+ Z_Operator *zop = zs->u.complex->roperator;
rset_bool_parms bool_parms;
int soft = 0;
+
bool_parms.rset_l = rpn_search_structure (zi, zs->u.complex->s1,
attributeSet,
if (rset_is_ranked(bool_parms.rset_r))
soft = 1;
bool_parms.key_size = sizeof(struct it_key);
- bool_parms.cmp = key_compare;
+ bool_parms.cmp = key_compare_it;
- switch (zs->u.complex->roperator->which)
+ switch (zop->which)
{
case Z_Operator_and:
r = rset_create (soft ? rset_kind_sand:rset_kind_and, &bool_parms);
case Z_Operator_and_not:
r = rset_create (soft ? rset_kind_snot:rset_kind_not, &bool_parms);
break;
+ case Z_Operator_prox:
+ if (zop->u.prox->which != Z_ProxCode_known)
+ {
+ zi->errCode = 132;
+ return NULL;
+ }
+ if (*zop->u.prox->proximityUnitCode != Z_ProxUnit_word)
+ {
+ static char val[16];
+ zi->errCode = 132;
+ zi->errString = val;
+ sprintf (val, "%d", *zop->u.prox->proximityUnitCode);
+ return NULL;
+ }
+ r = rpn_proximity (bool_parms.rset_l, bool_parms.rset_r,
+ *zop->u.prox->ordered,
+ (!zop->u.prox->exclusion ? 0 :
+ *zop->u.prox->exclusion),
+ *zop->u.prox->relationType,
+ *zop->u.prox->distance);
+ break;
default:
- assert (0);
+ zi->errCode = 110;
+ return NULL;
}
}
else if (zs->which == Z_RPNStructure_simple)
}
else
{
- assert (0);
+ zi->errCode = 3;
+ return NULL;
}
}
else
{
- assert (0);
+ zi->errCode = 3;
+ return NULL;
}
return r;
}
RSFD rfd, wfd;
RSET w;
rset_temp_parms parms;
-
+ int maxResultSetSize = atoi (res_get_def (common_resource,
+ "maxResultSetSize", "400"));
logf (LOG_DEBUG, "count_set_save");
*count = 0;
parms.key_size = sizeof(struct it_key);
rfd = rset_open (*r, RSETF_READ|RSETF_SORT_SYSNO);
while (rset_read (*r, rfd, &key))
{
- logf (LOG_DEBUG, "sysno=%-7d seqno=%d", key.sysno, key.seqno);
if (key.sysno != psysno)
{
- rset_write (w, wfd, &key);
- psysno = key.sysno;
+ if (*count < maxResultSetSize)
+ rset_write (w, wfd, &key);
(*count)++;
+ psysno = key.sysno;
}
kno++;
}
idx = scan_info->after - pos + scan_info->before;
else
idx = - pos - 1;
- logf (LOG_DEBUG, "%-3d %s", idx, name+len_prefix);
scan_info->list[idx].term = odr_malloc (scan_info->odr,
strlen(name + len_prefix)+1);
strcpy (scan_info->list[idx].term, name + len_prefix);
rset_trunc (zi, &scan_info_array[j].list[ptr[j]].isam_p, 1);
bool_parms.key_size = sizeof(struct it_key);
- bool_parms.cmp = key_compare;
+ bool_parms.cmp = key_compare_it;
bool_parms.rset_l = rset;
bool_parms.rset_r = rset2;
&scan_info_array[j].list[before-1-ptr[j]].isam_p, 1);
bool_parms.key_size = sizeof(struct it_key);
- bool_parms.cmp = key_compare;
+ bool_parms.cmp = key_compare_it;
bool_parms.rset_l = rset;
bool_parms.rset_r = rset2;