From 69074dfdd933ea0b31824bd341266329fbb34212 Mon Sep 17 00:00:00 2001
From: Heikki Levanto
Date: Tue, 10 Dec 2013 13:39:31 +0100
Subject: [PATCH] More analysis

---
 heikki/queries/process2.pl |    2 +-
 heikki/queries/process3.pl |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100755 heikki/queries/process3.pl

diff --git a/heikki/queries/process2.pl b/heikki/queries/process2.pl
index 27ac5b5..7f4d014 100755
--- a/heikki/queries/process2.pl
+++ b/heikki/queries/process2.pl
@@ -31,7 +31,7 @@ while ( ) {
     if ( $thisquery eq $query ){
       $count ++;
     } else {
-      print OUT "$count ; $query \n";
+      print OUT "$count ; $thisquery \n";
       $totalqueries += $count;
       $uniquequeries += 1;
       $singlehits += 1 if ($hits <= 1 );
diff --git a/heikki/queries/process3.pl b/heikki/queries/process3.pl
new file mode 100755
index 0000000..54651a2
--- /dev/null
+++ b/heikki/queries/process3.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl -w
+# Analyzing DBC's example queries
+# Step 3: Eliminate search terms
+# Assumes x3 is the result of process2.pl
+# Result should be sorted and passed to the next step
+#
+# Reads "hits ; query" lines from x3, replaces every search term with "x"
+# (keeping the words og/eller/ikke and any "=" signs), sums the hits
+# per resulting pattern, and writes "total; pattern" lines to x5.
+
+use strict;
+
+open F, "<", "x3" or die "could not open x3: $!\n";
+
+my $linecount = 0;
+my %counts;
+while ( <F> ) {
+  next if /^#/;
+  chomp();
+  $linecount ++;
+  #last if ($linecount >10);
+  # Split on the first ';' only, so a ';' inside the query is not lost
+  my ( $hits, $query ) = split (';', $_, 2);
+  next unless defined($query);  # no ';' at all: not a "hits ; query" line
+  $query =~ s/^ +//;
+  $query =~ s/ +$//;
+  #print "$_ : '$query'\n";
+  my @terms;
+  for my $t ( split(' ',$query) ) {
+    #print " '$t'\n";
+    # og/eller/ikke (Danish and/or/not, presumably the query operators)
+    # are kept; in any other token each run of non-'=' chars becomes x
+    if ( $t ne "og" && $t ne "eller" && $t ne "ikke" ) {
+      $t =~ s/[^ =]+/x/g;
+    }
+    push @terms, $t;
+  }
+  my $nq = join(" ", @terms);
+  $counts{$nq} = 0 unless defined($counts{$nq});
+  $counts{$nq} += $hits;
+  print "$nq: $hits $counts{$nq}\n";
+}
+close F;
+
+open OUT, ">", "x5" or die "could not open x5 for writing: $!\n";
+# Largest total first; ties are broken by pattern so the output is stable
+for my $q ( sort { $counts{$b} <=> $counts{$a} || $a cmp $b } keys(%counts) ) {
+  print "q='$q' n=$counts{$q} \n";
+  print OUT "$counts{$q}; $q\n";
+}
+close OUT or die "could not write x5: $!\n";
-- 
1.7.10.4