From: Heikki Levanto Date: Fri, 6 Dec 2013 15:18:25 +0000 (+0100) Subject: Real-life queries, and some scripts to analyze them X-Git-Url: http://lists.indexdata.dk/?a=commitdiff_plain;h=6f186956784fa4a1f6bdfd1411090e4545e1e873;p=pazpar2-moved-to-github.git Real-life queries, and some scripts to analyze them --- diff --git a/heikki/queries/process1.pl b/heikki/queries/process1.pl new file mode 100755 index 0000000..e0b372d --- /dev/null +++ b/heikki/queries/process1.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +# Analyzing DBC's example queries +# Step 1: Split the file, merge nexpage queries + + +open F, "searches.csv" or die "could not open searches.csv: $!\n"; + +open OUT, ">x1" or die "could not open x1 for writing: $!\n"; +print OUT "#query ; hits; pages\n"; + +my $thisline = ""; +my $pagecount = 1; # how many next-page searches (same query) +while ( ) { + next unless /^[0-9]/; + my ( $lineno, $query, $ccl, $hits, $timing ) = split (';'); + if ( $thisline eq $lineno ){ + $pagecount ++; + } else { + print OUT "$query ; $hits ; $pagecount\n"; + #print "$query ; $hits ; $pagecount\n"; + $pagecount = 1; + $thisline = $lineno; + } + + #die "STOPPING EARLY " if $lineno > 5000; +} diff --git a/heikki/queries/process2.pl b/heikki/queries/process2.pl new file mode 100755 index 0000000..1a0c495 --- /dev/null +++ b/heikki/queries/process2.pl @@ -0,0 +1,105 @@ +#!/usr/bin/perl -w +# Analyzing DBC's example queries +# Step 2: Count identical queries, and query types +# Assumes x2 is the result of process1.pl, sorted alphabetically +# + + +open F, "x2" or die "could not open x2: $!\n"; + +open OUT, "| sort -n -r > x3" or die "could not open sort>x3 for writing: $!\n"; +print OUT "#count ; query \n"; + +my $thisquery = ""; +my $count = 1; # how many times seen the same query +my $totalqueries = 0; +my $booleans = 0; # queries that contain a boolean operator +my $fields = 0; # queries that start with field=, f.ex. title or author +my $multifields = 0; # queries that contain more than one field +my $simplequeries = 0; # queries without booleans or fields +my $uniquequeries = 0; +my $singlehits = 0; +my $singleterms = 0; +my $doubleterms = 0; +my $tripleterms = 0; +my $manyterms = 0; +my %field; +while ( ) { + next if /^#/; + my ( $query, $hits, $pages ) = split (';'); + next if ( $hits =~ /["a-z]/); # semicolons in original query, ignore + if ( $thisquery eq $query ){ + $count ++; + } else { + print OUT "$count ; $query \n"; + $totalqueries += $count; + $uniquequeries += 1; + $singlehits += 1 if ($hits <= 1 ); + my $is_simple = 1; + my $is_boolean = 0; + if ( / og /i || / eller /i || / ikke /i ) { + $booleans++; + $is_simple = 0; + $is_boolean = 1; + } + if ( $query =~ /^[("]*([^=; \t]+)=(.*)/ ) { + my ( $k, $v ) = ( $1, $2); + $fields += $count; + $is_simple = 0; + $multifields += $count if ( $v =~ /=/ ); # multiple fields mentioned + if ( $v =~ /=/ && !$is_boolean ) { + print "OOPS: $query\n"; + } + $k = lc($k); + $field{$k} = 0 unless defined($field{$k}); + $field{$k} += $count; + } # field query + if ($is_simple) { + $simplequeries += $count; + } + my $q2 = $query; + $q2 =~ s/\S+=//; # remove fields + $q2 =~ s/ og //; + $q2 =~ s/ eller //; + $q2 =~ s/ ikke //; + $singleterms += $count if ($q2 =~ /^\s*\S+\s*$/ ); + $doubleterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s*$/ ); + $tripleterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s+\S+\s*$/ ); + $manyterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s+\S+\s+\S+/ ); + #print "$query ; $hits ; $pagecount\n"; + $count = 1; + $thisquery = $query; + } +} +close OUT; + +open OUT, ">x4" or die "could not open x4 for writing summary: $!\n"; + +sub line { + my ($capt, $number, $dopercent ) = @_; + my $percents = ""; + if ( $dopercent ) { + $percents = "". int( $number*1000 / $totalqueries ) / 10 . "%" ; + } + while (length($percents) < 6 ) { $percents = " $percents"; } + print OUT sprintf("%-20s %7d %s\n", $capt, $number, $percents); +} + +line "Total queries", $totalqueries, 0; +line "Unique queries", $uniquequeries, 1; + +line "Boolean queries", $booleans, 1; +line "Fielded queries", $fields, 1; +line "Multiple fields", $multifields, 1; +line "Simple queries", $simplequeries, 1; +line "One-hit queries", $singlehits, 1; +print OUT "\n"; +line "Single term", $singleterms, 1; +line "Double terms", $doubleterms, 1; +line "Triple terms", $tripleterms, 1; +line "Many terms", $manyterms, 1; + +print OUT "\nFields\n"; +for my $k (sort{ $field{$b} <=> $field{$a} } keys(%field) ) { + line " $k", $field{$k}, 1 if ($field{$k} > 100); +} \ No newline at end of file diff --git a/heikki/queries/searches.csv.bz2 b/heikki/queries/searches.csv.bz2 new file mode 100644 index 0000000..96bb52a Binary files /dev/null and b/heikki/queries/searches.csv.bz2 differ