From 77b2ebc9e0758617813da61ff88318e4c24205ac Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Thu, 28 Nov 2013 16:19:11 +0100 Subject: [PATCH] Tests on SOLR, plottable data out of pazpar2, plot script --- heikki/README-HEIKKI | 10 ++++++++++ heikki/solr/run.sh | 33 +++++++++++++++++++++++++++++++++ heikki/solr/solr.lui.xml | 31 ++++++++++++++++++++++++++++--- heikki/solr/test3.cfg | 11 +++++++++-- heikki/solr/test3.sh | 38 ++++++++++++++++++++++++++++++++++---- src/relevance.c | 41 ++++++++++++++++++++++++++++++++++++++--- 6 files changed, 152 insertions(+), 12 deletions(-) create mode 100755 heikki/solr/run.sh diff --git a/heikki/README-HEIKKI b/heikki/README-HEIKKI index 0571886..172ff99 100644 --- a/heikki/README-HEIKKI +++ b/heikki/README-HEIKKI @@ -62,3 +62,13 @@ Add this to the target defs After this, it should be possible to get records from different databases, some with many records, some with a few. This is a good testing ground for merging rankings! Test first with a round-robin, and plot the scores. + +Thu 28-Nov +Ok, I can now merge a number of SOLR databases (harvest jobs), and plot their rankings +as solr gives them, in the order of different merge strategies +Next: Add the normalizing merge strategy. 
Then plot different strategies against different queries +Write a conclusion, and consider this plotting job done + + + + diff --git a/heikki/solr/run.sh b/heikki/solr/run.sh new file mode 100755 index 0000000..ea40894 --- /dev/null +++ b/heikki/solr/run.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# +# Run the test with a number of queries, plot the results +# + +if [ "$1" == "" ] +then + echo "Need an argument, the name of this test run" + echo "It will be in the title of all plots, together with the query" + exit 1 +fi +TITLE="$1" +OUTFILE=`echo $1.txt | sed 's/ /_/g'` +echo "$TITLE" > $OUTFILE +./test3.sh clean + +function onerun() { + QRY="$1" + echo "" >> $OUTFILE + echo "Query: $QRY" >> $OUTFILE + PNG=`echo "solr_$TITLE $QRY.png" | sed 's/ /_/g' ` + echo "Graph: $PNG" >> $OUTFILE + ./test3.sh "$QRY" "$TITLE" + grep "plotline" show.out | head -10 >> $OUTFILE + cp plot.png $PNG +} + +onerun "harry potter" +onerun "vietnam war" +onerun "water or fire or ice" +echo "" >> $OUTFILE +echo "client#, position, tf/idf, roundrobin, solr # database # title" >> $OUTFILE + diff --git a/heikki/solr/solr.lui.xml b/heikki/solr/solr.lui.xml index 4e5905d..a016e82 100644 --- a/heikki/solr/solr.lui.xml +++ b/heikki/solr/solr.lui.xml @@ -1,8 +1,33 @@ - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/heikki/solr/test3.cfg b/heikki/solr/test3.cfg index 7816b85..f0ed3c0 100644 --- a/heikki/solr/test3.cfg +++ b/heikki/solr/test3.cfg @@ -7,9 +7,13 @@ - + + + + + @@ -37,6 +41,7 @@ + @@ -117,7 +122,9 @@ - + + + diff --git a/heikki/solr/test3.sh b/heikki/solr/test3.sh index f40845d..741a722 100755 --- a/heikki/solr/test3.sh +++ b/heikki/solr/test3.sh @@ -42,9 +42,18 @@ then else Q=$1 fi + +if [ -z "$2" ] +then + HEADLINE="$Q" +else + HEADLINE="$2: $Q" +fi + QRY=`echo $Q | sed 's/ /+/g' ` -SORT="sort=score" +#SORT="sort=score" +SORT="sort=relevance_h" #SEARCH="command=search$SES&$QRY&rank=1&sort=relevance" #SEARCH="command=search$SES&$QRY" 
#SEARCH="command=search$SES&query=$QRY&sort=relevance" @@ -80,9 +89,30 @@ echo $SHOW curl -s "http://localhost:9017/?$SHOW" > show.out #grep "relevance" show.out | grep += | grep -v "(0)" #grep "round-robin" show.out -grep '^ ' show.out | head -11 -grep 'Received' dbc-opensearch-gw.log | head -1 >> titles.out -grep '^ ' show.out >> titles.out + +# Plot the lines created by the code +grep plotline show.out > scores.data +echo "Client numbers" +cat scores.data | cut -d' ' -f2 | sort -u +head -10 scores.data + +echo " + set term png + set out \"plot.png\" + set title \"$HEADLINE\" +" > plot.cmd +echo ' + plot "scores.data" using 0:($2==0?$6:1/0) with points title "db-1", \ + "scores.data" using 0:($2==1?$6:1/0) with points title "db-2", \ + "scores.data" using 0:($2==2?$6:1/0) with points title "db-3", \ + "scores.data" using 0:($2==3?$6:1/0) with points title "db-4", \ + "scores.data" using 0:($2==4?$6:1/0) with points title "db-5", \ + "scores.data" using 0:($2==5?$6:1/0) with points title "db-6" \ +' >> plot.cmd +cat plot.cmd | gnuplot + + +exit 0 # The old plotting code # Plot it DF=`echo $QRY | sed 's/@//g' | sed 's/[+"]/_/g' | sed s"/'//g "` diff --git a/src/relevance.c b/src/relevance.c index 2e5411b..5284686 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -353,6 +353,19 @@ void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) r->doc_frequency_vec[0]++; } +static const char *getfield(struct record *bestrecord, const char *tag) // Returns the display text of bestrecord's metadata field named tag, or "" when the field id is negative or the record holds no value for it +{ + struct session *se = client_get_session(bestrecord->client); + int md_field_id = conf_service_metadata_field_id(se->service, tag); + struct record_metadata *md = 0; + if (md_field_id <0) + return ""; + md = bestrecord->metadata[md_field_id]; + if ( md) + return md->data.text.disp; + return ""; +} + // Prepare for a relevance-sorted read void relevance_prepare_read(struct relevance *rel, struct reclist *reclist, enum conf_sortkey_type type) @@ -429,11 +442,13 @@ void relevance_prepare_read(struct 
relevance *rel, struct reclist *reclist, int thisclient = 0; struct record *bestrecord = 0; int nclust = 0; + // Find the best record in a cluster. NOTE(review): the test below keeps the record with the highest position value, not the lowest - confirm which is intended for (record = rec->records; record; record = record->next) { if ( bestrecord == 0 || bestrecord->position < record->position ) bestrecord = record; - nclust++; + nclust++; // and count them all, for logging } + // find the client number for the record (we only have a pointer to the client) while ( clients[thisclient] != 0 && clients[thisclient] != bestrecord->client ) thisclient++; @@ -442,12 +457,32 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist, yaz_log(YLOG_LOG,"round-robin: found new client at %d: p=%p\n", thisclient, bestrecord->client); clients[thisclient] = bestrecord->client; } - int tfrel = relevance; - relevance = -(bestrecord->position * n_clients + thisclient) ; + // Calculate a round-robin score. NOTE(review): the two log calls below still print 'relevance', which at that point holds the tf/idf value, not robinscore + int tfrel = relevance; // keep the old tf/idf score + int robinscore = -(bestrecord->position * n_clients + thisclient) ; wrbuf_printf(w,"round-robin score: pos=%d client=%d ncl=%d tfscore=%d score=%d\n", bestrecord->position, thisclient, nclust, tfrel, relevance ); yaz_log(YLOG_LOG,"round-robin score: pos=%d client=%d ncl=%d score=%d", bestrecord->position, thisclient, nclust, relevance ); + + // Check if the record has a score field + const char *score = getfield(bestrecord,"score"); + int solrscore = 10000.0 * atof(score); + const char *id = getfield(bestrecord, "id"); + // clean the id: copy only its leading run of digits, bounded by the size of idbuf + char idbuf[64]; + i=0; + while( i < (int)sizeof(idbuf) - 1 && id[i] >= '0' && id[i] <= '9' ) { + idbuf[i] = id[i]; + i++; + } + idbuf[i] = '\0'; + + const char *title = getfield(bestrecord, "title"); + wrbuf_printf(w,"plotline: %d %d %d %d %d # %s %s\n", + thisclient, bestrecord->position, + tfrel, robinscore, solrscore, idbuf, title ); + relevance = solrscore; } rec->relevance_score = relevance; } -- 1.7.10.4