projects
/
pazpar2-moved-to-github.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Merge branch 'master' into paz-927
[pazpar2-moved-to-github.git]
/
src
/
relevance.c
diff --git
a/src/relevance.c
b/src/relevance.c
index
4cbf7f2
..
e484ca9
100644
(file)
--- a/
src/relevance.c
+++ b/
src/relevance.c
@@
-1,5
+1,5
@@
/* This file is part of Pazpar2.
/* This file is part of Pazpar2.
- Copyright (C) 2006-2013 Index Data
+ Copyright (C) Index Data
Pazpar2 is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Pazpar2 is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
@@
-77,6
+77,7
@@
const int scorefield_none = -1; // Do not normalize anything, use tf/idf as is
// This is the old behavior, and the default
const int scorefield_internal = -2; // use our tf/idf, but normalize it
const int scorefield_position = -3; // fake a score based on the position
// This is the old behavior, and the default
const int scorefield_internal = -2; // use our tf/idf, but normalize it
const int scorefield_position = -3; // fake a score based on the position
+// Positive numbers indicate the field to be used for scoring.
// A structure for each (sub)record. There is one list for each client
struct norm_record
// A structure for each (sub)record. There is one list for each client
struct norm_record
@@
-130,7
+131,7
@@
struct norm_client *findnorm( struct relevance *rel, struct client* client)
}
}
-// Add a record in the list for that client, for normalizing later
+// Add all records from a cluster into the list for that client, for normalizing later
static void setup_norm_record( struct relevance *rel, struct record_cluster *clust)
{
struct record *record;
static void setup_norm_record( struct relevance *rel, struct record_cluster *clust)
{
struct record *record;
@@
-158,6
+159,7
@@
static void setup_norm_record( struct relevance *rel, struct record_cluster *cl
}
yaz_log(YLOG_LOG,"Got score for %d/%d : %f ",
norm->num, record->position, rp->score );
}
yaz_log(YLOG_LOG,"Got score for %d/%d : %f ",
norm->num, record->position, rp->score );
+ record -> score = rp->score;
if ( norm->count == 1 )
{
norm->max = rp->score;
if ( norm->count == 1 )
{
norm->max = rp->score;
@@
-165,8
+167,8
@@
static void setup_norm_record( struct relevance *rel, struct record_cluster *cl
} else {
if ( rp->score > norm->max )
norm->max = rp->score;
} else {
if ( rp->score > norm->max )
norm->max = rp->score;
- if ( rp->score < norm->min && abs(rp->score) < 1e-6 )
- norm->min = rp->score; // skip zeroes
+ if ( rp->score < norm->min )
+ norm->min = rp->score;
}
}
}
}
}
}
@@
-190,14
+192,15
@@
static double squaresum( struct norm_record *rp, double a, double b)
static void normalize_scores(struct relevance *rel)
{
const int maxiterations = 1000;
static void normalize_scores(struct relevance *rel)
{
const int maxiterations = 1000;
- const double enough = 1000.0; // sets the number of decimals we are happy with
+ const double enough = 100.0; // sets the number of decimals we are happy with
const double stepchange = 0.5; // reduction of the step size when finding middle
// 0.5 seems to be magical, much better than 0.4 or 0.6
struct norm_client *norm;
for ( norm = rel->norm; norm; norm = norm->next )
{
const double stepchange = 0.5; // reduction of the step size when finding middle
// 0.5 seems to be magical, much better than 0.4 or 0.6
struct norm_client *norm;
for ( norm = rel->norm; norm; norm = norm->next )
{
- yaz_log(YLOG_LOG,"Normalizing client %d: scorefield=%d count=%d range=%f %f",
- norm->num, norm->scorefield, norm->count, norm->min, norm->max);
+ yaz_log(YLOG_LOG,"Normalizing client %d: scorefield=%d count=%d range=%f %f = %f",
+ norm->num, norm->scorefield, norm->count, norm->min,
+ norm->max, norm->max-norm->min);
norm->a = 1.0; // default normalizing factors, no change
norm->b = 0.0;
if ( norm->scorefield != scorefield_none &&
norm->a = 1.0; // default normalizing factors, no change
norm->b = 0.0;
if ( norm->scorefield != scorefield_none &&
@@
-210,13
+213,26
@@
static void normalize_scores(struct relevance *rel)
double chi;
char *branch = "?";
// initial guesses for the parameters
double chi;
char *branch = "?";
// initial guesses for the parameters
+ // Rmax = a * rmax + b # want to be 1.0
+ // Rmin = a * rmin + b # want to be 0.0
+ // Rmax - Rmin = a ( rmax - rmin ) # subtracting equations
+ // 1.0 - 0.0 = a ( rmax - rmin )
+ // a = 1 / range
+ // Rmin = a * rmin + b
+ // b = Rmin - a * rmin
+ // = 0.0 - 1/range * rmin
+ // = - rmin / range
+
if ( range < 1e-6 ) // practically zero
range = norm->max;
a = 1.0 / range;
if ( range < 1e-6 ) // practically zero
range = norm->max;
a = 1.0 / range;
- b = abs(norm->min);
+ b = -1.0 * norm->min / range;
+ // b = fabs(norm->min) / range;
as = a / 10;
as = a / 10;
- bs = b / 10;
+ bs = fabs(b) / 10;
chi = squaresum( norm->records, a,b);
chi = squaresum( norm->records, a,b);
+ yaz_log(YLOG_LOG,"Initial done: it=%d: a=%f / %f b=%f / %f chi = %f",
+ 0, a, as, b, bs, chi );
while (it++ < maxiterations) // safeguard against things not converging
{
double aplus = squaresum(norm->records, a+as, b);
while (it++ < maxiterations) // safeguard against things not converging
{
double aplus = squaresum(norm->records, a+as, b);
@@
-269,7
+285,7
@@
static void normalize_scores(struct relevance *rel)
branch = "step b";
}
}
branch = "step b";
}
}
- yaz_log(YLOG_LOG,"Fitting %s it=%d: a=%f %f b=%f %f chi=%f ap=%f am=%f, bp=%f bm=%f p=%f",
+ yaz_log(YLOG_LOG,"Fitting %s it=%d: a=%g %g b=%g %g chi=%g ap=%g am=%g, bp=%g bm=%g p=%g",
branch, it, a, as, b, bs, chi,
aplus, aminus, bplus, bminus, prevchi );
norm->a = a;
branch, it, a, as, b, bs, chi,
aplus, aminus, bplus, bminus, prevchi );
norm->a = a;
@@
-280,12
+296,8
@@
static void normalize_scores(struct relevance *rel)
}
}
}
}
- yaz_log(YLOG_LOG,"Fitting done: it=%d: a=%f / %f b=%f / %f chi = %f",
+ yaz_log(YLOG_LOG,"Fitting done: it=%d: a=%g / %g b=%g / %g chi = %g",
it-1, a, as, b, bs, chi );
it-1, a, as, b, bs, chi );
- yaz_log(YLOG_LOG," a: %f < %f %d",
- fabs(as)*enough, fabs(a), (fabs(as) * enough < fabs(a)) );
- yaz_log(YLOG_LOG," b: %f < %f %d",
- fabs(bs)*enough, fabs(b), (fabs(bs) * enough < fabs(b)) );
}
if ( norm->scorefield != scorefield_none )
}
if ( norm->scorefield != scorefield_none )
@@
-295,14
+307,13
@@
static void normalize_scores(struct relevance *rel)
double r = nr->score;
r = norm->a * r + norm -> b;
nr->clust->relevance_score = 10000 * r;
double r = nr->score;
r = norm->a * r + norm -> b;
nr->clust->relevance_score = 10000 * r;
+ nr->record->score = r;
yaz_log(YLOG_LOG,"Normalized %f * %f + %f = %f",
nr->score, norm->a, norm->b, r );
// TODO - This keeps overwriting the cluster score in random order!
yaz_log(YLOG_LOG,"Normalized %f * %f + %f = %f",
nr->score, norm->a, norm->b, r );
// TODO - This keeps overwriting the cluster score in random order!
- // Need to merge results better
+ // Need to merge results better
}
}
-
}
}
-
} // client loop
}
} // client loop
}
@@
-635,7
+646,7
@@
void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
rel->doc_frequency_vec[i]);
}
}
rel->doc_frequency_vec[i]);
}
}
- // Calculate relevance for each document
+ // Calculate relevance for each document (cluster)
while (1)
{
int relevance = 0;
while (1)
{
int relevance = 0;
@@
-682,14
+693,15
@@
void relevance_prepare_read(struct relevance *rel, struct reclist *reclist)
// Build the normalizing structures
// List of (sub)records for each target
setup_norm_record( rel, rec );
// Build the normalizing structures
// List of (sub)records for each target
setup_norm_record( rel, rec );
-
- // TODO - Loop again, merge individual record scores into clusters
- // Can I reset the reclist, or can I leave and enter without race conditions?
-
+
} // cluster loop
normalize_scores(rel);
} // cluster loop
normalize_scores(rel);
-
+
+ // TODO - Calculate the cluster scores from individual records
+ // At the moment the record scoring puts one of them in the cluster...
+ reclist_rewind(reclist);
+
reclist_leave(reclist);
xfree(idfvec);
reclist_leave(reclist);
xfree(idfvec);