2 * $Id: termlists.c,v 1.5 2007-01-15 19:01:15 quinn Exp $
8 #include <yaz/yaz-util.h>
14 #include "termlists.h"
17 // As terms are found in incoming records, they are added to (or updated in) a
18 // Hash table. When term records are updated, a frequency value is updated. At
19 // the same time, a highscore is maintained for the most frequent terms.
21 struct termlist_bucket
23 struct termlist_score term;
24 struct termlist_bucket *next;
29 struct termlist_bucket **hashtable;
33 struct termlist_score **highscore;
42 // Jenkins one-at-a-time hash (from wikipedia)
43 static unsigned int hash(const unsigned char *key)
45 unsigned int hash = 0;
59 struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
65 // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
66 halfnumterms = numterms >> 1;
69 while (hashsize < halfnumterms)
71 res = nmem_malloc(nmem, sizeof(struct termlist));
72 res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
73 memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
74 res->hashtable_size = hashsize;
76 res->hashmask = hashsize - 1; // Creates a bitmask
78 res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
79 res->highscore_size = highscore_size;
80 res->highscore_num = 0;
81 res->highscore_min = 0;
86 static void update_highscore(struct termlist *tl, struct termlist_score *t)
92 if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
96 for (i = 0; i < tl->highscore_num; i++)
98 if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
100 if (tl->highscore[i] == t)
103 if (tl->highscore_num)
104 tl->highscore_min = tl->highscore[smallest]->frequency;
105 if (t->frequency < tl->highscore_min)
106 tl->highscore_min = t->frequency;
109 if (tl->highscore_num < tl->highscore_size)
111 tl->highscore[tl->highscore_num++] = t;
112 if (t->frequency < tl->highscore_min)
113 tl->highscore_min = t->frequency;
117 if (t->frequency > tl->highscore[smallest]->frequency)
119 tl->highscore[smallest] = t;
124 void termlist_insert(struct termlist *tl, const char *term)
127 struct termlist_bucket **p;
130 if (strlen(term) > 255)
133 for (cp = buf + strlen(buf) - 1; cp > buf &&
134 (*cp == ',' || *cp == '.' || *cp == ' '); cp--)
137 bucket = hash((unsigned char *)buf) & tl->hashmask;
138 for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
140 if (!strcmp(buf, (*p)->term.term))
142 (*p)->term.frequency++;
143 update_highscore(tl, &((*p)->term));
147 if (!*p) // We made it to the end of the bucket without finding match
149 struct termlist_bucket *new = nmem_malloc(tl->nmem,
150 sizeof(struct termlist_bucket));
151 new->term.term = nmem_strdup(tl->nmem, buf);
152 new->term.frequency = 1;
155 update_highscore(tl, &new->term);
159 static int compare(const void *s1, const void *s2)
161 struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
162 return (*p2)->frequency - (*p1)->frequency;
165 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
167 qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
168 *len = tl->highscore_num;
169 return tl->highscore;
175 * indent-tabs-mode: nil
177 * vim: shiftwidth=4 tabstop=8 expandtab