From c5fd6cd53303b4b838e0c316113d446c7dbe0ed4 Mon Sep 17 00:00:00 2001 From: Heikki Levanto Date: Thu, 12 Dec 2013 15:39:16 +0100 Subject: [PATCH] test of least-square normalizing just a perl script to normalize the numbers I got from the other tests, and to plot. Run ./fit.pl *.in for a nice plot --- heikki/fitting/fit.pl | 137 +++++++++++++++++++++++++++++++++++++++++ heikki/fitting/os-potter.in | 99 +++++++++++++++++++++++++++++ heikki/fitting/os-water_or.in | 101 ++++++++++++++++++++++++++++++ heikki/fitting/os-zen.in | 9 +++ heikki/fitting/primo1.in | 101 ++++++++++++++++++++++++++++++ heikki/fitting/primo2.in | 101 ++++++++++++++++++++++++++++++ heikki/fitting/primo3.in | 101 ++++++++++++++++++++++++++++++ heikki/fitting/solr.4.in | 60 ++++++++++++++++++ heikki/fitting/solr.5.in | 20 ++++++ heikki/fitting/solr.6.in | 23 +++++++ src/relevance.c | 4 +- 11 files changed, 754 insertions(+), 2 deletions(-) create mode 100755 heikki/fitting/fit.pl create mode 100644 heikki/fitting/os-potter.in create mode 100644 heikki/fitting/os-water_or.in create mode 100644 heikki/fitting/os-zen.in create mode 100644 heikki/fitting/primo1.in create mode 100644 heikki/fitting/primo2.in create mode 100644 heikki/fitting/primo3.in create mode 100644 heikki/fitting/solr.4.in create mode 100644 heikki/fitting/solr.5.in create mode 100644 heikki/fitting/solr.6.in diff --git a/heikki/fitting/fit.pl b/heikki/fitting/fit.pl new file mode 100755 index 0000000..913eca2 --- /dev/null +++ b/heikki/fitting/fit.pl @@ -0,0 +1,137 @@ +#!/usr/bin/perl -w +# fit.pl - experiments in curve fitting +# for pazpar2 ranking normalizing + +# We have a number of data points ( position, score) from +# different sources. 
The task is to normalize them so that +# they all fall near the curve y=1/p, where p is the position +# This is done by adjusting the ranks R so that Rn = aR+b +# We need to find parameters a,b so as to minimize the chi- +# squared difference from y=1/p + + +my $plotnr = 1; # number the tmp files for plotting +my $plotcmd = ""; # the plot commands for gnuplot + +# Calculate the (squared) difference from the normalized rank to the 1/n function +# Params +# p = position (x) +# r = rank, not normalized +# a,b normalizing params +sub diff { + my ( $p, $r, $a, $b ) = @_; + my $rn = $r * $a + $b; + my $f = 1.0 / $p; # target value + my $d = $rn - $f; + return $d * $d; +} + +# Read and process one data file +# Just one float number per line, nothing else +sub onefile { + my $fn = shift; + my @d; + open F, $fn or die "Could not open $fn: $!\n"; + my $n = 1; # number of data points + my $first; + my $last; + my $title; + while ( ){ + chomp(); + $title = $_ unless defined($title); + next unless /^[0-9]/; # skip comments etc + my $v = 1.0 * $_ ; + $first = $v unless defined($first); + $last = $v; + #print "Data $n is $v\n"; + $d[$n++] = $v; + } + $title =~ s/^[# ]+//; # clean the '#' and leading space + print "$fn: '$title' $n points: $first - $last \n"; + # Initial guess Rn = a*R + b + my $a = 1.0 / $first; + my $b = - $last; + # step sizes for a and b + my $da = $a / 3; + my $db = - $b / 3; + my $iteration = 0; + my $prev = 0.0; + while (1) { + $iteration++; + # 5 sums: at (a,b) (a+,b), (a-,b), (a,b+), (a,b-) + my $sab = 0.0; # at a,b + my $sap = 0.0; # at a+da,b + my $sam = 0.0; # at a-da,n + my $sbp = 0.0; # at a, b+db + my $sbm = 0.0; # at a, b-db + for ( my $p = 1 ; $p < $n; $p++ ) { + $sab += diff( $p, $d[$p], $a, $b ); + $sap += diff( $p, $d[$p], $a+$da, $b ); + $sam += diff( $p, $d[$p], $a-$da, $b ); + $sbp += diff( $p, $d[$p], $a, $b+$db ); + $sbm += diff( $p, $d[$p], $a, $b-$db ); + } + my $dif = $sab - $prev; + #print "iteration $iteration: a=$a +- $da b=$b +- $db 
chisq=$sab dif=$dif\n"; + if ( (abs($da) < abs($a)/100.0 && abs($db) < abs($b)/100.0) || + ($iteration >= 100 ) || + (abs($dif) < 0.00001 ) ) { + print "it-$iteration: a=$a +- $da b=$b +- $db chisq=$sab dif=$dif\n"; + last; + } + $prev = $sab; + # adjust a + if ( $sap < $sab && $sap < $sam ) { + $a += $da; + } elsif ( $sam < $sab && $sam < $sap ) { + $a -= $da; + } else { + $da = $da /2; + } + $da = $da * 0.99; + # adjust b + if ( $sbp < $sab && $sbp < $sbm ) { + $b += $db; + } elsif ( $sbm < $sab && $sbm < $sbp ) { + $b -= $db; + } else { + $db = $db /2; + } + $db = $db * 0.99; + } + + # plot the file + my $pf = "/tmp/plot.$plotnr.data"; + $plotnr++; + open PF, ">$pf" or die "Could not open plot file $pf: $!\n"; + for ( my $p = 1 ; $p < $n; $p++ ) { + my $rn = $d[$p] * $a + $b; + print PF "$p $rn\n"; + } + close PF; + $plotcmd .= "," if ($plotcmd); + $plotcmd .= "\"$pf\" using 1:2 with points title \"$title\""; + + + +} + +# main + +if ( !defined($ARGV[0]) ) { + die "Need at least one file to plot\n"; +} +while ($ARGV[0]) { + onefile( $ARGV[0] ); + shift(@ARGV); +} +my $cmd = + "set term png\n" . + "set out \"plot.png\" \n" . 
+ "plot $plotcmd \n"; + +print "$cmd \n"; + +open GP, "| gnuplot" or die "Could not open a pipe to gnuplot: $!\n"; +print GP $cmd; +close GP; \ No newline at end of file diff --git a/heikki/fitting/os-potter.in b/heikki/fitting/os-potter.in new file mode 100644 index 0000000..fc77330 --- /dev/null +++ b/heikki/fitting/os-potter.in @@ -0,0 +1,99 @@ +# OpenSearch: Harry Potter +35632 +6386 +39669 +62696 +62696 +32809 +32809 +39669 +55836 +39669 +55836 +62696 +41044 +39544 +41976 +49630 +49630 +50795 +6043 +34662 +34506 +14020 +6825 +6825 +34506 +11982 +12767 +27727 +2452 +11077 +31873 +32809 +30702 +35632 +35632 +1252 +58113 +16620 +24931 +37956 +34031 +38090 +5895 +32809 +39669 +32809 +39669 +30702 +35632 +6825 +13456 +0 +3021 +37548 +11876 +45461 +43659 +10559 +5538 +6386 +13285 +34762 +34762 +37584 +0 +59435 +40863 +41406 +37300 +32439 +32370 +0 +63142 +11535 +47107 +28198 +50795 +20776 +32809 +29717 +32809 +27727 +35353 +10885 +30702 +30756 +27796 +27727 +34363 +37369 +32439 +30794 +36301 +37369 +104072 +13650 +5767 +0 diff --git a/heikki/fitting/os-water_or.in b/heikki/fitting/os-water_or.in new file mode 100644 index 0000000..21003d0 --- /dev/null +++ b/heikki/fitting/os-water_or.in @@ -0,0 +1,101 @@ +# OpenSearch: Water or Fire or Ice +1072620 +1072620 +227252 +953170 +0 +190130 +132687 +357539 +41227 +653127 +182406 +0 +342992 +186997 +32487 +158852 +265375 +103032 +190130 +295437 +244746 +357539 +124766 +504118 +0 +158861 +254715 +0 +0 +0 +0 +23663 +11977 +11977 +286400 +26359 +154715 +25311 +0 +82271 +126715 +91603 +286400 +254715 +38714 +38714 +25311 +82747 +0 +278475 +91514 +161559 +161559 +161559 +161559 +161559 +0 +293315 +91603 +20613 +21422 +20982 +27976 +73574 +86354 +64764 +225581 +0 +35025 +138314 +138314 +25311 +108300 +238359 +253717 +0 +32468 +0 +0 +0 +0 +0 +16234 +0 +0 +26359 +0 +50624 +284612 +12720 +13988 +0 +0 +59792 +0 +0 +0 +83929 +0 +0 diff --git a/heikki/fitting/os-zen.in b/heikki/fitting/os-zen.in new file mode 100644 index 
0000000..559faff --- /dev/null +++ b/heikki/fitting/os-zen.in @@ -0,0 +1,9 @@ +# OpenSearch: Zen and motorcycle +949202 +413772 +59799 +105466 +17462 +64071 +0 +0 diff --git a/heikki/fitting/primo1.in b/heikki/fitting/primo1.in new file mode 100644 index 0000000..667c1f9 --- /dev/null +++ b/heikki/fitting/primo1.in @@ -0,0 +1,101 @@ +#primo-1 +0.20756114 +0.13844302 +0.10148811 +0.10148811 +0.0888021 +0.0888021 +0.0888021 +0.0888021 +0.0888021 +0.0888021 +0.0888021 +0.0888021 +0.06343007 +0.06343007 +0.06343007 +0.054605015 +0.050744057 +0.050744057 +0.04440105 +0.04440105 +0.04440105 +0.04440105 +0.04288424 +0.038058043 +0.038058043 +0.038058043 +0.038058043 +0.038058043 +0.033645514 +0.032335546 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.031715035 +0.03139628 +0.03019823 +0.025889121 +0.02537203 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 +0.025372028 diff --git a/heikki/fitting/primo2.in b/heikki/fitting/primo2.in new file mode 100644 index 0000000..7ef4dde --- /dev/null +++ b/heikki/fitting/primo2.in @@ -0,0 +1,101 @@ +#primo-2 +0.20761509 +0.02813717 +0.013241022 +0.013241022 +0.011585894 +0.011585894 +0.011585894 +0.011585894 +0.011585894 +0.011585894 +0.011585894 +0.011585894 +0.009792839 +0.008275638 +0.008275638 
+0.008275638 +0.008137711 +0.0077239294 +0.0077239294 +0.0077239294 +0.0066707116 +0.006620511 +0.006620511 +0.00641362 +0.0060688015 +0.005792947 +0.005792947 +0.005792947 +0.005792947 +0.005792947 +0.0053102016 +0.005189515 +0.004965385 +0.0049653836 +0.0049653836 +0.0049653836 +0.0049653836 +0.004827456 +0.004827456 +0.004787209 +0.004758492 +0.004593435 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004137819 +0.004096232 +0.0039643953 +0.0038964467 +0.0038964467 +0.0038921367 +0.003827483 +0.003793001 +0.0037412783 +0.0036817277 +0.0035235784 +0.0034085026 +0.003379219 +0.003379219 +0.0033444054 +0.0033102573 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 +0.0033102555 diff --git a/heikki/fitting/primo3.in b/heikki/fitting/primo3.in new file mode 100644 index 0000000..9b80bf6 --- /dev/null +++ b/heikki/fitting/primo3.in @@ -0,0 +1,101 @@ +#primo-3 +0.9688704 +0.48564208 +0.4844352 +0.34602517 +0.24282716 +0.2422176 +0.2422176 +0.17301258 +0.13841006 +0.13841006 +0.13841006 +0.08650733 +0.06920503 +0.0605544 +0.0605544 +0.04555319 +0.043436013 +0.04338245 +0.043360904 +0.043360904 +0.043360904 +0.043360904 +0.043360904 +0.04333935 +0.04333935 +0.04329782 +0.04328929 +0.043283623 +0.036257647 +0.034602515 +0.034602515 +0.034602515 +0.032769855 +0.031989623 +0.030320304 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.0302772 +0.030199146 +0.029892702 +0.02957932 +0.028755857 +0.026976993 +0.026775088 +0.026775088 +0.026708232 
+0.026482046 +0.026482044 +0.026206192 +0.026116762 +0.025731273 +0.025316 +0.024804201 +0.024745526 +0.024551062 +0.024409074 +0.024283035 +0.024211751 +0.023766499 +0.023736173 +0.023624163 +0.023554381 +0.023495784 +0.023482127 +0.02325794 +0.023171788 +0.023171788 +0.023115499 +0.02302765 +0.022964898 +0.022964898 +0.022910973 +0.02275256 +0.02262008 +0.022522116 +0.022344224 +0.022237735 +0.022181446 +0.022108069 +0.022106145 +0.02174285 +0.021742849 +0.021723554 +0.021723554 +0.021666314 +0.021626573 +0.021626573 +0.021626573 +0.021568384 +0.021453962 +0.021378037 diff --git a/heikki/fitting/solr.4.in b/heikki/fitting/solr.4.in new file mode 100644 index 0000000..3838e38 --- /dev/null +++ b/heikki/fitting/solr.4.in @@ -0,0 +1,60 @@ +#solr-4 +23010 +21476 +21256 +21089 +20581 +20351 +20351 +20351 +20351 +20351 +20089 +20089 +20089 +19555 +19525 +19525 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19404 +19358 +19358 +19358 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18975 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 diff --git a/heikki/fitting/solr.5.in b/heikki/fitting/solr.5.in new file mode 100644 index 0000000..219b7fe --- /dev/null +++ b/heikki/fitting/solr.5.in @@ -0,0 +1,20 @@ +#solr-5 +21694 +20581 +20581 +20581 +20581 +20351 +20293 +20293 +20293 +20293 +20293 +20293 +20293 +20293 +20293 +20293 +19404 +19404 +19404 diff --git a/heikki/fitting/solr.6.in b/heikki/fitting/solr.6.in new file mode 100644 index 0000000..2925d58 --- /dev/null +++ b/heikki/fitting/solr.6.in @@ -0,0 +1,23 @@ +#solr-6 +22137 +20581 +20119 +19525 +19358 +19358 +19358 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 +18788 diff --git a/src/relevance.c b/src/relevance.c index 1f3eb28..377cdb7 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -53,7 +53,7 @@ struct relevance // Structure to keep data 
for normalizing scores from one client struct normalizing { - int num; + int num; // number of the client float sum; float max; int count; @@ -612,7 +612,7 @@ void relevance_prepare_read(struct relevance *rel, struct reclist *reclist, wrbuf_printf(w,"plotline: %d %d %d %d %d %d %d # %s %s\n", norm->num, bestrecord->position, tfrel, robinscore, solrscore, normscore, mergescore, idbuf, title ); - relevance = normscore; + relevance = solrscore; } rec->relevance_score = relevance; } -- 1.7.10.4