1 /* $Id: termlists.c,v 1.8 2007-05-10 09:24:32 adam Exp $
2 Copyright (c) 2006-2007, Index Data.
4 This file is part of Pazpar2.
6 Pazpar2 is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2, or (at your option) any later
11 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 You should have received a copy of the GNU General Public License
17 along with Pazpar2; see the file LICENSE. If not, write to the
18 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
25 #include <yaz/yaz-util.h>
31 #include "termlists.h"
34 // As terms are found in incoming records, they are added to (or updated in) a
35 // Hash table. When term records are updated, a frequency value is updated. At
36 // the same time, a highscore is maintained for the most frequent terms.
38 struct termlist_bucket
40 struct termlist_score term;
41 struct termlist_bucket *next;
46 struct termlist_bucket **hashtable;
50 struct termlist_score **highscore;
59 // Jenkins one-at-a-time hash (from wikipedia)
60 static unsigned int hash(const unsigned char *key)
62 unsigned int hash = 0;
76 struct termlist *termlist_create(NMEM nmem, int numterms, int highscore_size)
82 // Calculate a hash size smallest power of 2 larger than 50% of expected numterms
83 halfnumterms = numterms >> 1;
86 while (hashsize < halfnumterms)
88 res = nmem_malloc(nmem, sizeof(struct termlist));
89 res->hashtable = nmem_malloc(nmem, hashsize * sizeof(struct termlist_bucket*));
90 memset(res->hashtable, 0, hashsize * sizeof(struct termlist_bucket*));
91 res->hashtable_size = hashsize;
93 res->hashmask = hashsize - 1; // Creates a bitmask
95 res->highscore = nmem_malloc(nmem, highscore_size * sizeof(struct termlist_score *));
96 res->highscore_size = highscore_size;
97 res->highscore_num = 0;
98 res->highscore_min = 0;
103 static void update_highscore(struct termlist *tl, struct termlist_score *t)
109 if (tl->highscore_num > tl->highscore_size && t->frequency < tl->highscore_min)
113 for (i = 0; i < tl->highscore_num; i++)
115 if (tl->highscore[i]->frequency < tl->highscore[smallest]->frequency)
117 if (tl->highscore[i] == t)
120 if (tl->highscore_num)
121 tl->highscore_min = tl->highscore[smallest]->frequency;
122 if (t->frequency < tl->highscore_min)
123 tl->highscore_min = t->frequency;
126 if (tl->highscore_num < tl->highscore_size)
128 tl->highscore[tl->highscore_num++] = t;
129 if (t->frequency < tl->highscore_min)
130 tl->highscore_min = t->frequency;
134 if (t->frequency > tl->highscore[smallest]->frequency)
136 tl->highscore[smallest] = t;
141 void termlist_insert(struct termlist *tl, const char *term)
144 struct termlist_bucket **p;
147 if (strlen(term) > 255)
151 for (cp = buf + strlen(buf); cp != buf && strchr(",. -", cp[-1]); cp--)
154 bucket = hash((unsigned char *)buf) & tl->hashmask;
155 for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
157 if (!strcmp(buf, (*p)->term.term))
159 (*p)->term.frequency++;
160 update_highscore(tl, &((*p)->term));
164 if (!*p) // We made it to the end of the bucket without finding match
166 struct termlist_bucket *new = nmem_malloc(tl->nmem,
167 sizeof(struct termlist_bucket));
168 new->term.term = nmem_strdup(tl->nmem, buf);
169 new->term.frequency = 1;
172 update_highscore(tl, &new->term);
176 static int compare(const void *s1, const void *s2)
178 struct termlist_score **p1 = (struct termlist_score**) s1, **p2 = (struct termlist_score **) s2;
179 return (*p2)->frequency - (*p1)->frequency;
182 struct termlist_score **termlist_highscore(struct termlist *tl, int *len)
184 qsort(tl->highscore, tl->highscore_num, sizeof(struct termlist_score*), compare);
185 *len = tl->highscore_num;
186 return tl->highscore;
192 * indent-tabs-mode: nil
194 * vim: shiftwidth=4 tabstop=8 expandtab