Rank tweak: follow=number will increase mult by number if two terms
occur next to each other; number-1 if they are one term apart, ... 0
if they are number apart (all in order). Default is 0 (following
terms have no effect).
Rank tweak: lead=number will increase mult by number if term is first
term in field, number-1 if second, ... 0 if term is at offset
number or more. Default value is 0 (position irrelevant).
Rank tweak: length=strategy. length="linear" if mult is to be divided
by length (existing, default behavior), length="log" if mult is to be
divided by log2(1+length), length="none" if mult is not to be affected
by length.
+
+Rank tweak: follow=number will increase mult by number if two terms
+occur next to each other; number-1 if they are one term apart, ... 0
+if they are number apart (all in order). Default is 0 (following
+terms have no effect).
+
+Rank tweak: lead=number will increase mult by number if term is first
+term in field, number-1 if second, ... 0 if term is at offset
+number or more. Default value is 0 (position irrelevant).
+
+Rank tweak: length=strategy. length="linear" if mult is to be divided
+by length (existing, default behavior), length="log" if mult is to be
+divided by log2(1+length), length="none" if mult is not to be affected
+by length.
+
--- 1.6.20 2012/09/21
Rank algorithm details may be printed as part of show response in
--- 1.6.20 2012/09/21
Rank algorithm details may be printed as part of show response in
{
// Initialize relevance structure with query terms
se->relevance = relevance_create_ccl(se->service->charsets, cn,
{
// Initialize relevance structure with query terms
se->relevance = relevance_create_ccl(se->service->charsets, cn,
- se->service->rank_cluster);
+ se->service->rank_cluster,
+ se->service->rank_follow,
+ se->service->rank_lead,
+ se->service->rank_length);
}
ccl_rpn_delete(cn);
return ret_value;
}
ccl_rpn_delete(cn);
return ret_value;
service->z3950_operation_timeout = 30;
service->rank_cluster = 1;
service->rank_debug = 0;
service->z3950_operation_timeout = 30;
service->rank_cluster = 1;
service->rank_debug = 0;
+ service->rank_follow = 0;
+ service->rank_lead = 0;
+ service->rank_length = 2;
{
char *rank_cluster = (char *) xmlGetProp(n, (xmlChar *) "cluster");
char *rank_debug = (char *) xmlGetProp(n, (xmlChar *) "debug");
{
char *rank_cluster = (char *) xmlGetProp(n, (xmlChar *) "cluster");
char *rank_debug = (char *) xmlGetProp(n, (xmlChar *) "debug");
+ char *rank_follow = (char *) xmlGetProp(n, (xmlChar *) "follow");
+ char *rank_lead = (char *) xmlGetProp(n, (xmlChar *) "lead");
+ char *rank_length= (char *) xmlGetProp(n, (xmlChar *) "length");
if (rank_cluster)
{
if (!strcmp(rank_cluster, "yes"))
if (rank_cluster)
{
if (!strcmp(rank_cluster, "yes"))
+ if (rank_follow)
+ {
+ service->rank_follow = atoi(rank_follow);
+ }
+ if (rank_lead)
+ {
+ service->rank_lead = atoi(rank_lead);
+ }
+ if (rank_length)
+ {
+ if (!strcmp(rank_length, "linear"))
+ service->rank_length = 2;
+ else if (!strcmp(rank_length, "log"))
+ service->rank_length = 1;
+ else if (!strcmp(rank_length, "none"))
+ service->rank_length = 0;
+ else
+ {
+ yaz_log(YLOG_FATAL, "service: rank@length linear|log|none");
+ return 0;
+ }
+ }
xmlFree(rank_cluster);
xmlFree(rank_debug);
xmlFree(rank_cluster);
xmlFree(rank_debug);
+ xmlFree(rank_follow);
+ xmlFree(rank_lead);
+ xmlFree(rank_length);
}
else if (!strcmp((const char *) n->name, "sort-default"))
{
}
else if (!strcmp((const char *) n->name, "sort-default"))
{
int z3950_operation_timeout;
int rank_cluster;
int rank_debug;
int z3950_operation_timeout;
int rank_cluster;
int rank_debug;
+ int rank_follow;
+ int rank_lead;
+ int rank_length;
char *default_sort;
int ref_count;
char *default_sort;
int ref_count;
char *merge_key;
int relevance_score;
int *term_frequency_vec;
char *merge_key;
int relevance_score;
int *term_frequency_vec;
- int *term_frequency_vec_tmp;
float *term_frequency_vecf;
// Set-specific ID for this record
char *recid;
float *term_frequency_vecf;
// Set-specific ID for this record
char *recid;
struct relevance
{
int *doc_frequency_vec;
struct relevance
{
int *doc_frequency_vec;
+ int *term_frequency_vec_tmp;
int vec_len;
struct word_entry *entries;
pp2_charset_token_t prt;
int rank_cluster;
int vec_len;
struct word_entry *entries;
pp2_charset_token_t prt;
int rank_cluster;
+ int follow_boost;
+ int lead_boost;
+ int length_divide;
const char *norm_str;
const char *display_str;
int termno;
const char *norm_str;
const char *display_str;
int termno;
char *ccl_field;
struct word_entry *next;
};
char *ccl_field;
struct word_entry *next;
};
-static int word_entry_match(struct word_entry *entries, const char *norm_str,
+static int word_entry_match(struct relevance *r, const char *norm_str,
const char *rank, int *mult)
{
const char *rank, int *mult)
{
- for (; entries; entries = entries->next)
+ int i = 1;
+ struct word_entry *entries = r->entries;
+ for (; entries; entries = entries->next, i++)
{
if (*norm_str && !strcmp(norm_str, entries->norm_str))
{
{
if (*norm_str && !strcmp(norm_str, entries->norm_str))
{
+ int extra = r->follow_boost;
+ struct word_entry *e_follow = entries;
const char *cp = 0;
int no_read = 0;
sscanf(rank, "%d%n", mult, &no_read);
const char *cp = 0;
int no_read = 0;
sscanf(rank, "%d%n", mult, &no_read);
memcmp(entries->ccl_field, rank, cp - rank) == 0)
*mult = atoi(cp + 1);
}
memcmp(entries->ccl_field, rank, cp - rank) == 0)
*mult = atoi(cp + 1);
}
+ (*mult) += entries->follow_boost;
+ while ((e_follow = e_follow->next) != 0 && extra > 0)
+ {
+ e_follow->follow_boost = extra--;
+ }
return entries->termno;
}
return entries->termno;
}
+ entries->follow_boost = 0;
const char *words, const char *rank,
const char *name)
{
const char *words, const char *rank,
const char *name)
{
- int *mult = cluster->term_frequency_vec_tmp;
+ int *mult = r->term_frequency_vec_tmp;
const char *norm_str;
int i, length = 0;
const char *norm_str;
int i, length = 0;
- struct word_entry *e = r->entries;
+ int lead_mult = r->lead_boost;
+ struct word_entry *e;
WRBUF w = cluster->relevance_explain1;
pp2_charset_token_first(r->prt, words, 0);
WRBUF w = cluster->relevance_explain1;
pp2_charset_token_first(r->prt, words, 0);
- for (i = 1; i < r->vec_len; i++)
+ for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
+ {
+ e->follow_boost = 0;
+ }
assert(rank);
while ((norm_str = pp2_charset_token_next(r->prt)))
{
int local_mult = 0;
assert(rank);
while ((norm_str = pp2_charset_token_next(r->prt)))
{
int local_mult = 0;
- int res = word_entry_match(r->entries, norm_str, rank, &local_mult);
+ int res = word_entry_match(r, norm_str, rank, &local_mult);
if (res)
{
assert(res < r->vec_len);
if (res)
{
assert(res < r->vec_len);
- mult[res] += local_mult;
+ mult[res] += local_mult + lead_mult;
+ if (lead_mult > 0)
+ --lead_mult;
- for (i = 1; i < r->vec_len; i++)
+ for (e = r->entries, i = 1; i < r->vec_len; i++, e = e->next)
- if (length > 0 && mult[i] > 0) /* only add if non-empty */
+ if (length == 0 || mult[i] == 0)
+ continue;
+ wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d)",
+ e->display_str, name, i, mult[i]);
+ switch (r->length_divide)
- wrbuf_printf(w, "%s: field=%s vecf[%d] += mult(%d) / length(%d);\n",
- e->display_str, name, i, mult[i], length);
+ case 0:
+ wrbuf_printf(w, ";\n");
+ cluster->term_frequency_vecf[i] += (double) mult[i];
+ break;
+ case 1:
+ wrbuf_printf(w, " / log2(1+length(%d));\n", length);
+ cluster->term_frequency_vecf[i] +=
+ (double) mult[i] / log2(1 + length);
+ break;
+ case 2:
+ wrbuf_printf(w, " / length(%d);\n", length);
cluster->term_frequency_vecf[i] += (double) mult[i] / length;
}
cluster->term_frequency_vec[i] += mult[i];
cluster->term_frequency_vecf[i] += (double) mult[i] / length;
}
cluster->term_frequency_vec[i] += mult[i];
}
cluster->term_frequency_vec[0] += length;
}
cluster->term_frequency_vec[0] += length;
struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
struct ccl_rpn_node *query,
struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
struct ccl_rpn_node *query,
+ int rank_cluster,
+ int follow_boost, int lead_boost,
+ int length_divide)
{
NMEM nmem = nmem_create();
struct relevance *res = nmem_malloc(nmem, sizeof(*res));
{
NMEM nmem = nmem_create();
struct relevance *res = nmem_malloc(nmem, sizeof(*res));
res->entries = 0;
res->vec_len = 1;
res->rank_cluster = rank_cluster;
res->entries = 0;
res->vec_len = 1;
res->rank_cluster = rank_cluster;
+ res->follow_boost = follow_boost;
+ res->lead_boost = lead_boost;
+ res->length_divide = length_divide;
res->prt = pp2_charset_token_create(pft, "relevance");
pull_terms(res, query);
res->prt = pp2_charset_token_create(pft, "relevance");
pull_terms(res, query);
res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
for (i = 0; i < res->vec_len; i++)
res->doc_frequency_vec[i] = 0;
res->doc_frequency_vec = nmem_malloc(nmem, res->vec_len * sizeof(int));
for (i = 0; i < res->vec_len; i++)
res->doc_frequency_vec[i] = 0;
+
+ // worker array
+ res->term_frequency_vec_tmp =
+ nmem_malloc(res->nmem,
+ res->vec_len * sizeof(*res->term_frequency_vec_tmp));
r->vec_len * sizeof(*rec->term_frequency_vecf));
for (i = 0; i < r->vec_len; i++)
rec->term_frequency_vecf[i] = 0.0;
r->vec_len * sizeof(*rec->term_frequency_vecf));
for (i = 0; i < r->vec_len; i++)
rec->term_frequency_vecf[i] = 0.0;
-
- // for relevance_countwords (so we don't have to xmalloc/xfree)
- rec->term_frequency_vec_tmp =
- nmem_malloc(r->nmem,
- r->vec_len * sizeof(*rec->term_frequency_vec_tmp));
struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
struct ccl_rpn_node *query,
struct relevance *relevance_create_ccl(pp2_charset_fact_t pft,
struct ccl_rpn_node *query,
+ int rank_cluster, int follow_boost,
+ int lead_boost, int length_divide);
void relevance_destroy(struct relevance **rp);
void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
void relevance_countwords(struct relevance *r, struct record_cluster *cluster,
void relevance_destroy(struct relevance **rp);
void relevance_newrec(struct relevance *r, struct record_cluster *cluster);
void relevance_countwords(struct relevance *r, struct record_cluster *cluster,