From: Heikki Levanto Date: Wed, 4 Dec 2013 11:58:56 +0000 (+0100) Subject: Merge branch 'ranking-h' of ssh://git.indexdata.com:222/home/git/pub/pazpar2 into... X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=4e7c5a2a359970dddb74662257b9946044d525a0;p=pazpar2-moved-to-github.git Merge branch 'ranking-h' of ssh://git.indexdata.com:222/home/git/pub/pazpar2 into ranking-h Conflicts: heikki/solr/test3.sh src/relevance.c Also fixed a detail with sorting of the score numbers --- 4e7c5a2a359970dddb74662257b9946044d525a0 diff --cc heikki/solr/test3.sh index 68bd69e,93c8886..649c79c --- a/heikki/solr/test3.sh +++ b/heikki/solr/test3.sh @@@ -97,13 -96,8 +97,15 @@@ echo "Client numbers cat scores.data | cut -d' ' -f2 | sort -u head -10 scores.data + exit 1 + +T1=`grep ": 1 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` +T2=`grep ": 2 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` +T3=`grep ": 3 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` +T4=`grep ": 4 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` +T5=`grep ": 5 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` +T6=`grep ": 6 " scores.data | head -1 | cut -d'#' -f2 | cut -d' ' -f2` + echo " set term png set out \"plot.png\" diff --cc src/relevance.c index e7f8585,ffa31b0..5450cae --- a/src/relevance.c +++ b/src/relevance.c @@@ -392,34 -366,14 +392,47 @@@ static const char *getfield(struct reco return ""; } +void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) +{ + int i; + + // Find the best record in a cluster - the one with lowest position + // (in this proto. Later, find a better one) + struct record *bestrecord = 0; + struct record *record; + struct normalizing *n; + float score; + for (record = cluster->records; record; record = record->next) + if ( bestrecord == 0 || bestrecord->position < record->position ) + bestrecord = record; + n = findnorm(r,bestrecord->client); + n->count ++; + score = atof( getfield(bestrecord,"score") ); + n->sum += score; + if ( n->max < score ) + n->max = score; + + for (i = 1; i < r->vec_len; i++) + if (cluster->term_frequency_vec[i] > 0) + r->doc_frequency_vec[i]++; + + r->doc_frequency_vec[0]++; +} + + + // Helper to compare floats, for qsort + static int sort_float(const void *x, const void *y) + { + const float *fx = x; + const float *fy = y; - return *fx - *fy; ++ //yaz_log(YLOG_LOG,"sorting %f and %f", *fx, *fy); // ### ++ if ( *fx > *fy ) ++ return 1; ++ if ( *fx < *fy ) ++ return -1; ++ return 0; // do not return *fx-*fy, it is often too close to zero. + } + // Prepare for a relevance-sorted read void relevance_prepare_read(struct relevance *rel, struct reclist *reclist, enum conf_sortkey_type type) @@@ -427,6 -381,12 +440,8 @@@ int i; float *idfvec = xmalloc(rel->vec_len * sizeof(float)); int n_clients = clients_count(); - struct client * clients[n_clients]; + int clusternumber = 0; + yaz_log(YLOG_LOG,"round-robin: have %d clients", n_clients); - for (i = 0; i < n_clients; i++) - clients[i] = 0; - reclist_enter(reclist); // Calculate document frequency vector for each term. @@@ -488,55 -449,79 +504,98 @@@ // get the log entries if (type == Metadata_sortkey_relevance_h) { struct record *record; - int thisclient = 0; + struct normalizing *norm; struct record *bestrecord = 0; int nclust = 0; - int tfrel = relevance; // keep the old tf/idf score; - int robinscore; - int solrscore; + int tfrel = relevance; // keep the old tf/idf score + int robinscore = 0; + int solrscore = 0; + int normscore; + const char *score; + const char *id; + const char *title; + char idbuf[64]; + int mergescore = 0; // Find the best record in a cluster - the one with lowest position for (record = rec->records; record; record = record->next) { if ( bestrecord == 0 || bestrecord->position < record->position ) bestrecord = record; nclust++; // and count them all, for logging } - // find the client number for the record (we only have a pointer - while ( clients[thisclient] != 0 - && clients[thisclient] != bestrecord->client ) - thisclient++; - if ( clients[thisclient] == 0 ) - { - yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client); - clients[thisclient] = bestrecord->client; - } + norm = findnorm(rel, bestrecord->client); // Calculate a round-robin score - robinscore = -(bestrecord->position * n_clients + thisclient) ; + robinscore = -(bestrecord->position * n_clients + norm->num) ; wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n", - bestrecord->position, thisclient, nclust, tfrel, relevance ); + bestrecord->position, norm->num, nclust, tfrel, relevance ); yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d", - bestrecord->position, thisclient, nclust, relevance ); + bestrecord->position, norm->num, nclust, relevance ); // Check if the record has a score field + score = getfield(bestrecord,"score"); ++ id = getfield(bestrecord, "id"); ++ title = getfield(bestrecord, "title"); + solrscore = 10000.0 * atof(score); - ++ // clear the id, we only want the first numerical part ++ i=0; ++ while( id[i] >= '0' && id[i] <= '9' ) { ++ idbuf[i] = id[i]; ++ i++; ++ } ++ idbuf[i] = '\0'; ++ if ( norm->count ) + { - const char *score = getfield(bestrecord,"score"); - const char *id = getfield(bestrecord, "id"); - const char *title = getfield(bestrecord, "title"); - // clear the id, we only want the first numerical part - char idbuf[64]; - solrscore = 10000.0 * atof(score); - i=0; - while( id[i] >= '0' && id[i] <= '9' ) { - idbuf[i] = id[i]; - i++; ++ //float avg = norm->sum / norm->count; ++ normscore = 10000.0 * ( atof(score) / norm->max ); ++ wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n", ++ score, norm->max, normscore); ++ } else ++ yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score ); ++ + // If we have a score in the best record, we probably have in them all + // and we can try to merge scores + if ( *score ) { + float scores[nclust]; + float s = 0.0; + int i=0; - for (record = rec->records; record; record = record->next, i++) { - scores[i] = atof( getfield(record,"score") ); - yaz_log(YLOG_LOG,"mergescore %d: %f", i, scores[i] ); - wrbuf_printf(w,"mergeplot %d: %f x\n", clusternumber, 10000*scores[i] ); ++ if ( rec->records && rec->records->next ) ++ { // have more than one record ++ for (record = rec->records; record; record = record->next, i++) ++ { ++ scores[i] = atof( getfield(record,"score") ); ++ yaz_log(YLOG_LOG,"mergescore %d: %f", i, scores[i] ); ++ wrbuf_printf(w,"mergeplot %d: %f x\n", clusternumber, 10000*scores[i] ); ++ } ++ qsort(scores, nclust, sizeof(float), sort_float ); ++ for (i = 0; icount ) - { - float avg = norm->sum / norm->count; - normscore = 10000.0 * ( atof(score) / norm->max ); - wrbuf_printf(w, "normscore: score(%s) / max(%f) *10000 = %d\n", - score, norm->max, normscore); - } else - yaz_log(YLOG_LOG, "normscore: no count, can not normalize %s ", score ); - - wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n", - norm->num, bestrecord->position, - tfrel, robinscore, solrscore, normscore, idbuf, title ); - qsort(scores, nclust, sizeof(float), sort_float ); - for (i = 0; iposition, but something from rec that + // corresponds to the hit number, for plotting. + } // merge score + id = getfield(bestrecord, "id"); + // clear the id, we only want the first numerical part + i=0; + while( id[i] >= '0' && id[i] <= '9' ) { + idbuf[i] = id[i]; + i++; } - relevance = normscore; // ### + idbuf[i] = '\0'; + + title = getfield(bestrecord, "title"); - wrbuf_printf(w,"plotline: %d %d %d %d %d %d # %s %s\n", - thisclient, bestrecord->position, - tfrel, robinscore, solrscore, mergescore, idbuf, title ); ++ wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n", ++ norm->num, bestrecord->position, ++ tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title ); + relevance = mergescore; } rec->relevance_score = relevance; }