-/* $Id: extract.c,v 1.156 2004-07-28 08:15:45 adam Exp $
+/* $Id: extract.c,v 1.162 2004-09-15 08:13:51 adam Exp $
Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
Index Data Aps
02111-1307, USA.
*/
-
#include <stdio.h>
#include <assert.h>
#include <ctype.h>
++zh->records_processed;
if (!(zh->records_processed % 1000))
{
- logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d",
+ logf (LOG_LOG, "Records: "ZINT_FORMAT" i/u/d "
+ ZINT_FORMAT"/"ZINT_FORMAT"/"ZINT_FORMAT,
zh->records_processed, zh->records_inserted, zh->records_updated,
zh->records_deleted);
}
int attrSetS, int attrUseS)
{
static const char *ws[32];
+ void *decode_handle = iscz1_start();
int off = 0;
int startSeq = -1;
- int i;
int seqno = 0;
-#if SU_SCHEME
- int chS, ch;
-#else
- short attrUse;
- char attrSet;
-#endif
+ int i;
for (i = 0; i<32; i++)
ws[i] = NULL;
-#if SU_SCHEME
- chS = zebraExplain_lookupSU (zh->reg->zei, attrSetS, attrUseS);
- if (chS < 0)
- return ws;
-#endif
while (off < reckeys->buf_used)
{
-
const char *src = reckeys->buf + off;
- const char *wstart;
- int lead;
-
- lead = *src++;
-#if SU_SCHEME
- if ((lead & 3)<3)
- {
- memcpy (&ch, src, sizeof(ch));
- src += sizeof(ch);
- }
-#else
- if (!(lead & 1))
- {
- memcpy (&attrSet, src, sizeof(attrSet));
- src += sizeof(attrSet);
- }
- if (!(lead & 2))
- {
- memcpy (&attrUse, src, sizeof(attrUse));
- src += sizeof(attrUse);
- }
-#endif
- wstart = src;
- while (*src++)
- ;
- if (lead & 60)
- seqno += ((lead>>2) & 15)-1;
- else
- {
- memcpy (&seqno, src, sizeof(seqno));
- src += sizeof(seqno);
- }
- if (
-#if SU_SCHEME
- ch == chS
-#else
- attrUseS == attrUse && attrSetS == attrSet
-#endif
- )
+ struct it_key key;
+ char *dst = (char*) &key;
+ int attrSet, attrUse;
+
+ iscz1_decode(decode_handle, &dst, &src);
+ assert(key.len < 4 && key.len > 2);
+
+ attrSet = (int) key.mem[0];
+ attrUse = (int) key.mem[1];
+ seqno = (int) key.mem[2];
+
+ if (attrUseS == attrUse && attrSetS == attrSet)
{
int woff;
-
if (startSeq == -1)
startSeq = seqno;
woff = seqno - startSeq;
if (woff >= 0 && woff < 31)
- ws[woff] = wstart;
+ ws[woff] = src;
}
+ while (*src++)
+ ;
off = src - reckeys->buf;
}
+ iscz1_stop(decode_handle);
assert (off == reckeys->buf_used);
return ws;
}
int recordOffset;
struct recordGroup *rGroup;
};
+
+void create_rec_keys_codec(struct recKeys *keys)
+{ /* Prepare keys for a new record: mark the key buffer as empty and
+   * call iscz1_reset on the codec handle that is used when keys are
+   * appended to this buffer. */
+ keys->buf_used = 0;
+ iscz1_reset(keys->codec_handle);
+}
static int file_extract_record(ZebraHandle zh,
SYSNO *sysno, const char *fname,
/* we are going to read from a file, so prepare the extraction */
int i;
- zh->reg->keys.buf_used = 0;
- zh->reg->keys.prevAttrUse = -1;
- zh->reg->keys.prevAttrSet = -1;
- zh->reg->keys.prevSeqNo = 0;
+ create_rec_keys_codec(&zh->reg->keys);
+
zh->reg->sortKeys.buf_used = 0;
recordOffset = fi->file_moffset;
{
rinfo = dict_lookup (zh->reg->matchDict, matchStr);
if (rinfo)
+ {
+ assert(*rinfo == sizeof(*sysno));
memcpy (sysno, rinfo+1, sizeof(*sysno));
+ }
}
else
{
int delete_flag,
int test_mode,
const char *recordType,
- int *sysno,
+ SYSNO *sysno,
const char *match_criteria,
const char *fname,
int force_update,
extractCtrl.endf = zebra_record_int_end;
extractCtrl.fh = &fc;
- zh->reg->keys.buf_used = 0;
- zh->reg->keys.prevAttrUse = -1;
- zh->reg->keys.prevAttrSet = -1;
- zh->reg->keys.prevSeqNo = 0;
+ create_rec_keys_codec(&zh->reg->keys);
+
zh->reg->sortKeys.buf_used = 0;
if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
if (matchStr) {
rinfo = dict_lookup (zh->reg->matchDict, matchStr);
if (rinfo)
+ {
+ assert(*rinfo == sizeof(*sysno));
memcpy (sysno, rinfo+1, sizeof(*sysno));
+ }
}
}
abort ();
}
- zh->reg->keys.buf_used = 0;
- zh->reg->keys.prevAttrUse = -1;
- zh->reg->keys.prevAttrSet = -1;
- zh->reg->keys.prevSeqNo = 0;
+ create_rec_keys_codec(&zh->reg->keys);
+
zh->reg->sortKeys.buf_used = 0;
extractCtrl.init = extract_init;
void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
int cmd, struct recKeys *reckeys)
{
-#if SU_SCHEME
-#else
- unsigned char attrSet = (unsigned char) -1;
- unsigned short attrUse = (unsigned short) -1;
-#endif
- int seqno = 0;
+ void *decode_handle = iscz1_start();
int off = 0;
int ch = 0;
ZebraExplainInfo zei = zh->reg->zei;
zh->reg->key_file_no = 0;
}
zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
+
while (off < reckeys->buf_used)
{
const char *src = reckeys->buf + off;
struct it_key key;
- int lead;
-
- lead = *src++;
+ char *dst = (char*) &key;
+ int attrSet, attrUse;
+
+ iscz1_decode(decode_handle, &dst, &src);
+ assert(key.len < 4 && key.len > 2);
+
+ attrSet = (int) key.mem[0];
+ attrUse = (int) key.mem[1]; /* sequence in mem[2] */
-#if SU_SCHEME
- if ((lead & 3) < 3)
- {
- memcpy (&ch, src, sizeof(ch));
- src += sizeof(ch);
- }
-#else
- if (!(lead & 1))
- {
- memcpy (&attrSet, src, sizeof(attrSet));
- src += sizeof(attrSet);
- }
- if (!(lead & 2))
- {
- memcpy (&attrUse, src, sizeof(attrUse));
- src += sizeof(attrUse);
- }
-#endif
if (zh->reg->key_buf_used + 1024 >
(zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))
extract_flushWriteKeys (zh,0);
- assert(zh->reg->ptr_i >= 0);
++(zh->reg->ptr_i);
assert(zh->reg->ptr_i > 0);
(zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =
(char*)zh->reg->key_buf + zh->reg->key_buf_used;
-#if SU_SCHEME
-#else
+
ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
if (ch < 0)
ch = zebraExplain_addSU (zei, attrSet, attrUse);
-#endif
+
assert (ch > 0);
zh->reg->key_buf_used +=
key_SU_encode (ch,((char*)zh->reg->key_buf) +
zh->reg->key_buf_used);
-
while (*src)
((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;
src++;
((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';
((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;
- if (lead & 60)
- seqno += ((lead>>2) & 15)-1;
- else
- {
- memcpy (&seqno, src, sizeof(seqno));
- src += sizeof(seqno);
- }
- key.seqno = seqno;
- key.sysno = sysno;
- memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key, sizeof(key));
+ key.len = 2;
+ key.mem[0] = sysno;
+ key.mem[1] = key.mem[2]; /* sequence .. */
+
+ memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used,
+ &key, sizeof(key));
(zh->reg->key_buf_used) += sizeof(key);
off = src - reckeys->buf;
}
assert (off == reckeys->buf_used);
+ iscz1_stop(decode_handle);
}
void extract_flushWriteKeys (ZebraHandle zh, int final)
zh->reg->key_buf_used = 0;
}
-void extract_add_index_string (RecWord *p, const char *string,
- int length)
+void extract_add_index_string (RecWord *p, const char *str, int length)
{
char *dst;
- unsigned char attrSet;
- unsigned short attrUse;
- int lead = 0;
- int diff = 0;
- int *pseqno = &p->seqno;
ZebraHandle zh = p->extractCtrl->handle;
- ZebraExplainInfo zei = zh->reg->zei;
struct recKeys *keys = &zh->reg->keys;
+ struct it_key key;
+ const char *src = (char*) &key;
if (keys->buf_used+1024 > keys->buf_max)
{
}
dst = keys->buf + keys->buf_used;
- /* leader byte is encoded as follows:
- bit 0 : 1 if attrset is unchanged; 0 if attrset is changed
- bit 1 : 1 if attruse is unchanged; 0 if attruse is changed
- */
- attrSet = p->attrSet;
- if (keys->buf_used > 0 && keys->prevAttrSet == attrSet)
- lead |= 1;
- else
- keys->prevAttrSet = attrSet;
- attrUse = p->attrUse;
- if (keys->buf_used > 0 && keys->prevAttrUse == attrUse)
- lead |= 2;
- else
- keys->prevAttrUse = attrUse;
-#if 1
- diff = 1 + *pseqno - keys->prevSeqNo;
- if (diff >= 1 && diff <= 15)
- lead |= (diff << 2);
- else
- diff = 0;
-#endif
- keys->prevSeqNo = *pseqno;
-
- *dst++ = lead;
+ key.len = 3;
+ key.mem[0] = p->attrSet;
+ key.mem[1] = p->attrUse;
+ key.mem[2] = p->seqno;
-#if SU_SCHEME
- if ((lead & 3) < 3)
- {
- int ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
- if (ch < 0)
- {
- ch = zebraExplain_addSU (zei, attrSet, attrUse);
- yaz_log (LOG_DEBUG, "addSU set=%d use=%d SU=%d",
- attrSet, attrUse, ch);
- }
- assert (ch > 0);
- memcpy (dst, &ch, sizeof(ch));
- dst += sizeof(ch);
- }
-#else
- if (!(lead & 1))
- {
- memcpy (dst, &attrSet, sizeof(attrSet));
- dst += sizeof(attrSet);
- }
- if (!(lead & 2))
- {
- memcpy (dst, &attrUse, sizeof(attrUse));
- dst += sizeof(attrUse);
- }
+#if 0
+ /* just for debugging .. */
+ yaz_log(LOG_LOG, "set=%d use=%d seqno=%d", p->attrSet, p->attrUse,
+ p->seqno);
#endif
+
+ iscz1_encode(keys->codec_handle, &dst, &src);
+
*dst++ = p->reg_type;
- memcpy (dst, string, length);
+ memcpy (dst, str, length);
dst += length;
*dst++ = '\0';
-
- if (!diff)
- {
- memcpy (dst, pseqno, sizeof(*pseqno));
- dst += sizeof(*pseqno);
- }
keys->buf_used = dst - keys->buf;
}
-static void extract_add_sort_string (RecWord *p, const char *string,
+static void extract_add_sort_string (RecWord *p, const char *str,
int length)
{
ZebraHandle zh = p->extractCtrl->handle;
off += key_SU_encode(p->attrSet, sk->buf + off);
off += key_SU_encode(p->attrUse, sk->buf + off);
off += key_SU_encode(length, sk->buf + off);
- memcpy (sk->buf + off, string, length);
+ memcpy (sk->buf + off, str, length);
sk->buf_used = off + length;
}
const char **map = 0;
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
while (map)
{
{
remain = p->length - (b - p->string);
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
else
map = 0;
}
buf[i++] = *(cp++);
remain = p->length - (b - p->string);
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, 0);
else
map = 0;
}
char buf[IT_MAX_WORD+1];
const char **map = 0;
int i = 0, remain = p->length;
+ int first; /* first position */
+
+yaz_log(LOG_DEBUG, "Complete field, w='%s'", p->string);
if (remain > 0)
- map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
+ map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain, 1);
while (remain > 0 && i < IT_MAX_WORD)
{
while (map && *map && **map == *CHR_SPACE)
{
remain = p->length - (b - p->string);
+
if (remain > 0)
- map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
+ {
+ first = i ? 0 : 1;
+ map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain, first);
+ }
else
map = 0;
}
{
const char *cp = *map;
- if (i >= IT_MAX_WORD)
- break;
- while (i < IT_MAX_WORD && *cp)
- buf[i++] = *(cp++);
+ if (**map == *CHR_CUT)
+ {
+ i = 0;
+ }
+ else
+ {
+ if (i >= IT_MAX_WORD)
+ break;
+ yaz_log(LOG_DEBUG, "Adding string to index '%d'", **map);
+ while (i < IT_MAX_WORD && *cp)
+ buf[i++] = *(cp++);
+ }
remain = p->length - (b - p->string);
if (remain > 0)
+ {
map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
- remain);
+ remain, 0);
+ }
else
map = 0;
}
i->prevseq=0;
i->prevcmd=-1;
i->keylen=0;
+ i->encode_handle = iscz1_start();
}
-char *encode_key_int (int d, char *bp)
-{
- if (d <= 63)
- *bp++ = d;
- else if (d <= 16383)
- {
- *bp++ = 64 + (d>>8);
- *bp++ = d & 255;
- }
- else if (d <= 4194303)
- {
- *bp++ = 128 + (d>>16);
- *bp++ = (d>>8) & 255;
- *bp++ = d & 255;
- }
- else
- {
- *bp++ = 192 + (d>>24);
- *bp++ = (d>>16) & 255;
- *bp++ = (d>>8) & 255;
- *bp++ = d & 255;
- }
- return bp;
-}
#define OLDENCODE 1
#ifdef OLDENCODE
void encode_key_write (char *k, struct encode_info *i, FILE *outf)
{
struct it_key key;
- char *bp = i->buf;
+ char *bp = i->buf, *bp0;
+ const char *src = (char *) &key;
+ /* copy term to output buf */
while ((*bp++ = *k++))
;
- memcpy (&key, k+1, sizeof(struct it_key));
- bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
- if (i->sysno != key.sysno)
- {
- i->sysno = key.sysno;
- i->seqno = 0;
- }
- else if (!i->seqno && !key.seqno && i->cmd == *k)
- return;
- bp = encode_key_int (key.seqno - i->seqno, bp);
- i->seqno = key.seqno;
- i->cmd = *k;
+ /* and copy & align key so we can mangle */
+ memcpy (&key, k+1, sizeof(struct it_key)); /* *k is insert/delete */
+
+ bp0 = bp++;
+ iscz1_encode(i->encode_handle, &bp, &src);
+ *bp0 = (*k * 128) + bp - bp0 - 1; /* length and insert/delete combined */
if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
{
logf (LOG_FATAL|LOG_ERRNO, "fwrite");
void encode_key_flush (struct encode_info *i, FILE *outf)
{ /* dummy routine as far as output goes: nothing is written to outf;
   * it only stops the codec handle held in i->encode_handle */
+ iscz1_stop(i->encode_handle);
}
#else