X-Git-Url: http://lists.indexdata.dk/cgi-bin?a=blobdiff_plain;f=relevance.c;h=c627a980a6c54a86692e9843c20d91120ee90fe1;hb=071dfd751f962f8f2e15d8bb106405a90a70549f;hp=fc85e3845072499da400ab209a33003c9504b3a1;hpb=d7967e62bb987396444404f5e6ed59bbda5f5131;p=pazpar2-moved-to-github.git
diff --git a/relevance.c b/relevance.c
index fc85e38..c627a98 100644
--- a/relevance.c
+++ b/relevance.c
@@ -1,28 +1,22 @@
 /*
- * $Id: relevance.c,v 1.1 2006-11-24 20:29:07 quinn Exp $
+ * $Id: relevance.c,v 1.3 2006-11-27 14:35:15 quinn Exp $
  */
 
 #include <ctype.h>
+#include <math.h>
+#include <stdlib.h>
 
 #include "relevance.h"
 #include "pazpar2.h"
 
 struct relevance
 {
-    struct relevance_record *records;
-    int num_records;
     int *doc_frequency_vec;
     int vec_len;
     struct word_trie *wt;
     NMEM nmem;
 };
 
-struct relevance_record
-{
-    struct record *record;
-    int *term_frequency_vec;
-};
-
 // We use this data structure to recognize terms in input records,
 // and map them to record term vectors for counting.
 struct word_trie
@@ -55,21 +49,53 @@ static void word_trie_addterm(NMEM nmem, struct word_trie *n, const char *term,
         else
         {
             c -= 'a';
-            if (!n->list[c].child)
-            {
-                struct word_trie *new = create_word_trie_node(nmem);
-                n->list[c].child = new;
-            }
             if (!*(++term))
                 n->list[c].termno = num;
             else
+            {
+                if (!n->list[c].child)
+                {
+                    struct word_trie *new = create_word_trie_node(nmem);
+                    n->list[c].child = new;
+                }
                 word_trie_addterm(nmem, n->list[c].child, term, num);
+            }
             break;
         }
     }
+}
+
+#define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' : -1)
+
+static int word_trie_match(struct word_trie *t, const char *word, int len, int *skipped)
+{
+    int c = raw_char(tolower(*word));
+
+    if (!len)
+        return 0;
+
+    word++;
+    len--;
+    (*skipped)++;
+    if (!len || raw_char(*word) < 0)
+    {
+        if (t->list[c].termno > 0)
+            return t->list[c].termno;
+        else
+            return 0;
+    }
+    else
+    {
+        if (t->list[c].child)
+        {
+            return word_trie_match(t->list[c].child, word, len, skipped);
+        }
+        else
+            return 0;
+    }
 }
 
+
 static struct word_trie *build_word_trie(NMEM nmem, const char **terms)
 {
     struct word_trie *res = create_word_trie_node(nmem);
@@ -93,38 +119,123 @@ struct relevance *relevance_create(NMEM nmem, const char **terms, int numrecs)
     res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
     bzero(res->doc_frequency_vec, res->vec_len * sizeof(int));
     res->nmem = nmem;
-    res->num_records = 0;
-    res->records = nmem_malloc(nmem, numrecs * sizeof(struct relevance_record *));
     res->wt = build_word_trie(nmem, terms);
     return res;
 }
 
-struct relevance_record *relevance_newrec(struct relevance *r, struct record *rec)
+void relevance_newrec(struct relevance *r, struct record *rec)
 {
-    struct relevance_record *res = nmem_malloc(r->nmem,
-            sizeof(struct relevance_record));
-    res->record = rec;
-    res->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
-    bzero(res->term_frequency_vec, r->vec_len * sizeof(int));
-    return res;
+    if (!rec->term_frequency_vec)
+    {
+        rec->term_frequency_vec = nmem_malloc(r->nmem, r->vec_len * sizeof(int));
+        bzero(rec->term_frequency_vec, r->vec_len * sizeof(int));
+    }
 }
 
-void relevance_countwords(struct relevance_record *rec, const char *words, int len)
+
+// FIXME. The definition of a word is crude here.. should support
+// some form of localization mechanism?
+void relevance_countwords(struct relevance *r, struct record *head, + const char *words, int len, int multiplier) { + while (len) + { + char c; + int res; + int skipped; + while (len && (c = raw_char(tolower(*words))) < 0) + { + words++; + len--; + } + if (!len) + return; + skipped = 0; + if ((res = word_trie_match(r->wt, words, len, &skipped))) + { + words += skipped; + len -= skipped; + head->term_frequency_vec[res] += multiplier; + } + else + { + while (len && (c = raw_char(tolower(*words))) >= 0) + { + words++; + len--; + } + } + head->term_frequency_vec[0]++; + } } -void relevance_donerecord(struct relevance_record *rec) +void relevance_donerecord(struct relevance *r, struct record *head) { + int i; + + for (i = 1; i < r->vec_len; i++) + if (head->term_frequency_vec[i] > 0) + r->doc_frequency_vec[i]++; + + r->doc_frequency_vec[0]++; } -// Prepare for a relevance-sorted read of up to num entries -void relevance_prepare_read(struct relevance *r, int num) +#ifdef FLOAT_REL +static int comp(const void *p1, const void *p2) +{ + float res; + struct record **r1 = (struct record **) p1; + struct record **r2 = (struct record **) p2; + res = (*r2)->relevance - (*r1)->relevance; + if (res > 0) + return 1; + else if (res < 0) + return -1; + else + return 0; +} +#else +static int comp(const void *p1, const void *p2) { + struct record **r1 = (struct record **) p1; + struct record **r2 = (struct record **) p2; + return (*r2)->relevance - (*r1)->relevance; } +#endif -struct record *relevance_read(struct relevance *r) +// Prepare for a relevance-sorted read of up to num entries +void relevance_prepare_read(struct relevance *rel, struct reclist *reclist) { - return 0; + int i; + float *idfvec = xmalloc(rel->vec_len * sizeof(float)); + + // Calculate document frequency vector for each term. + for (i = 1; i < rel->vec_len; i++) + { + if (!rel->doc_frequency_vec[i]) + idfvec[i] = 0; + else + idfvec[i] = log((float) rel->doc_frequency_vec[0] / rel->doc_frequency_vec[i]); + } + // Calculate relevance for each document + for (i = 0; i < reclist->num_records; i++) + { + int t; + struct record *rec = reclist->flatlist[i]; + float relevance; + relevance = 0; + for (t = 1; t < rel->vec_len; t++) + { + float termfreq; + if (!rec->term_frequency_vec[0]) + break; + termfreq = (float) rec->term_frequency_vec[t] / rec->term_frequency_vec[0]; + relevance += termfreq * idfvec[t]; + } + rec->relevance = (int) (relevance * 100000); + } + qsort(reclist->flatlist, reclist->num_records, sizeof(struct record*), comp); + reclist->pointer = 0; } /*
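
The ranking this patch adds in relevance_prepare_read() is a standard tf*idf weighting: each query term's inverse document frequency is log(N/df), where N is doc_frequency_vec[0] (records processed so far) and df is the number of records that contained the term; each record's term counts are normalized by its total word count in term_frequency_vec[0]; and the float score is scaled by 100000 into the integer relevance field that comp() sorts on. The sketch below restates that calculation on its own, outside the patch; the function and variable names in it, and the sample numbers, are invented for illustration and are not part of the pazpar2 code.

/* Standalone sketch of the tf*idf scoring introduced by this patch in
 * relevance_prepare_read().  The names score_record, doc_freq and
 * term_freq are invented for the example.  Slot 0 of each vector holds
 * totals: doc_freq[0] = documents seen, term_freq[0] = words counted
 * in this record. */
#include <math.h>
#include <stdio.h>

static int score_record(const int *term_freq, const int *doc_freq, int vec_len)
{
    float relevance = 0.0;
    int t;

    if (!term_freq[0])          /* record contained no countable words */
        return 0;
    for (t = 1; t < vec_len; t++)
    {
        /* idf = log(N / df); a term seen in no document contributes nothing */
        float idf = doc_freq[t]
            ? log((float) doc_freq[0] / doc_freq[t]) : 0.0;
        /* term frequency normalized by the record's total word count */
        float tf = (float) term_freq[t] / term_freq[0];
        relevance += tf * idf;
    }
    return (int) (relevance * 100000);  /* same fixed-point scaling as the patch */
}

int main(void)
{
    /* Hypothetical counts: 10 documents seen; query term 1 occurred in 3
     * of them, term 2 in 7.  This record has 50 words: term 1 four times,
     * term 2 once. */
    int doc_freq[3]  = { 10, 3, 7 };
    int term_freq[3] = { 50, 4, 1 };

    printf("relevance = %d\n", score_record(term_freq, doc_freq, 3));
    return 0;
}

Compiled with something like "cc tfidf_sketch.c -lm" (the file name is arbitrary), this prints relevance = 10345: (4/50)*log(10/3) + (1/50)*log(10/7) is roughly 0.103, which scaled by 100000 and truncated gives the same kind of integer ordering key that comp() then sorts in descending order.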