1 /* This file is part of Pazpar2.
2 Copyright (C) Index Data
4 Pazpar2 is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
9 Pazpar2 is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 #include <yaz/yaz-util.h>
29 #include "termlists.h"
30 #include "jenkins_hash.h"
33 // As terms are found in incoming records, they are added to (or updated in) a
34 // Hash table. When term records are updated, a frequency value is updated. At
35 // the same time, a highscore is maintained for the most frequent terms.
37 struct termlist_bucket
39 struct termlist_score term;
40 struct termlist_bucket *next;
45 struct termlist_bucket **hashtable;
52 struct termlist *termlist_create(NMEM nmem)
54 struct termlist *res = nmem_malloc(nmem, sizeof(struct termlist));
57 nmem_malloc(nmem, res->hash_size * sizeof(struct termlist_bucket*));
58 memset(res->hashtable, 0, res->hash_size * sizeof(struct termlist_bucket*));
64 void termlist_insert(struct termlist *tl, const char *display_term,
65 const char *norm_term, const char *id, size_t id_len,
69 struct termlist_bucket **p;
72 if (strlen(norm_term) > 255)
74 strcpy(buf, norm_term);
75 bucket = jenkins_hash((unsigned char *)buf) % tl->hash_size;
76 for (p = &tl->hashtable[bucket]; *p; p = &(*p)->next)
78 if (!strcmp(buf, (*p)->term.norm_term))
80 (*p)->term.frequency += freq;
84 if (!*p) // We made it to the end of the bucket without finding match
86 struct termlist_bucket *new = nmem_malloc(tl->nmem,
87 sizeof(struct termlist_bucket));
88 new->term.norm_term = nmem_strdup(tl->nmem, buf);
89 new->term.display_term = *display_term ?
90 nmem_strdup(tl->nmem, display_term) : new->term.norm_term;
91 new->term.id = id ? nmem_strdupn(tl->nmem, id, id_len) : 0;
92 new->term.frequency = freq;
99 static int compare(const void *s1, const void *s2)
101 struct termlist_score **p1 = (struct termlist_score **) s1;
102 struct termlist_score **p2 = (struct termlist_score **) s2;
103 int d = (*p2)->frequency - (*p1)->frequency;
106 return strcmp((*p1)->display_term, (*p2)->display_term);
109 struct termlist_score **termlist_highscore(struct termlist *tl, int *len,
112 struct termlist_score **highscore =
113 (struct termlist_score **)
114 nmem_malloc(nmem, tl->no_entries * sizeof(*highscore));
118 for (bucket = 0; bucket < tl->hash_size; bucket++)
120 struct termlist_bucket *p;
121 for (p = tl->hashtable[bucket]; p; p = p->next)
122 highscore[no++] = &p->term;
124 assert(no == tl->no_entries);
125 qsort(highscore, tl->no_entries, sizeof(struct termlist_score*), compare);
126 *len = tl->no_entries;
133 * c-file-style: "Stroustrup"
134 * indent-tabs-mode: nil
136 * vim: shiftwidth=4 tabstop=8 expandtab