#!/usr/local/bin/perl
# gaps.cgi
# GAPS: Google API Proximity Search
# Searches Google for two terms near each other,
# using successive wildcard searches.
# See http://www.staggernation.com/gaps/readme.html for info.
# By Kevin Shay, kevin [AT] staggernation [DOT] com
# Version 1.1, 8/6/02
#
# Flow of the main section:
#   1. read and validate the form parameters
#   2. for each distance level 0 .. n, run one wildcard query
#      (two if the user allows the terms in either order)
#   3. collect de-duplicated results under a sort key chosen by the user
#   4. render the results (or an error message) into $html_output / $err,
#      which the output section that follows prints

use strict;

# tell perl where to find SOAP::Lite
use lib qw(
    /home/staggernation/lib/5.6.0/
    /home/staggernation/lib/site_perl/5.6.0
);

use vars qw($this_script);
use SOAP::Lite;
use CGI;
use URI::Escape;

# this library contains generic code for all the Google API scripts
require('ga_lib.pl');

# constant declarations:

# the URL of this CGI script, so we can target the form there
$this_script = $ENV{'SERVER_URL'} . $ENV{'SCRIPT_NAME'};

my $docs_path = 'http://www.staggernation.com/gaps';

# to use your own Google key, un-comment the line below and insert
# your key between the quotes...
#my $google_key = 'YOUR_KEY_HERE';
# ...and comment out this line:
my $google_key = require('key.pl');

# the maximum distance between terms;
# this is arbitrarily set low so we don't hit the
# 1000-query daily limit too quickly
my $max_distance = 3;

# get the data that was passed to the form
# (direct method call rather than the ambiguous indirect-object "new CGI")
my $cgi = CGI->new;

# put it into a hash for easy access
my $params = cgi_params($cgi);

# this hash will keep track of Latin-UTF8 character mappings,
# so they don't have to be recalculated repeatedly after the
# first time each one is encountered
my %utf8_cache = ();

# variables we'll assign for output on the HTML page
my $html_output      = '';
my $html_result_head = '';
my $page_title       = '';
my $err              = '';

print "Content-type: text/html\n\n";

# keep a copy of the query terms in their original state, because we'll
# be UTF8-encoding the terms
@{$params}{qw(orig_term1 orig_term2)} = @{$params}{qw(term1 term2)};

# user can enter their own key in the form
$google_key = ($params->{'key'} || $google_key);

# parameter validation and defaults:
$params->{'n'} = 1
    unless (($params->{'n'} >= 1) && ($params->{'n'} <= $max_distance));

# The alternations are grouped and anchored with \A...\z so the WHOLE value
# must be one of the allowed words.  Without the grouping, ^ and $ bind only
# to the first and last alternative, and any string merely containing
# "URL" or "ranking" would be accepted (the sort value is later used as a
# hash key into each result).
$params->{'sort'} = 'title'
    unless ($params->{'sort'} =~ /\A(?:title|URL|ranking|proximity)\z/);
$params->{'order'} = 'that'
    unless ($params->{'order'} =~ /\A(?:that|either)\z/);

$params->{'per_query'} = 10
    unless (($params->{'per_query'} >= 1) && ($params->{'per_query'} <= 10));
$params->{'limit_total'} = 0
    unless (($params->{'limit_total'} >= 1) && ($params->{'limit_total'} <= 50));
$params->{'filter'} = ($params->{'filter'} eq 'on') ? 1 : 0;

# we need two terms entered to proceed
if ($params->{'term1'} && $params->{'term2'}) {
    # a named loop variable is used instead of $_ so that the library
    # call below cannot clobber it
    for my $field (qw(term1 term2)) {
        # strip any quotation marks, since we'll be enclosing each term
        $params->{$field} =~ s/"//g;
        # encode foreign characters with UTF8 encoding
        $params->{$field} = latin_to_utf8($params->{$field}, \%utf8_cache);
    }

    # load the Google service definition
    my $service = SOAP::Lite->service("http://api.google.com/GoogleSearch.wsdl");

    # this will hold results from all queries; key depends on sort option
    my %results = ();
    # keep track of all result URLs to avoid showing duplicates
    my %urls_seen = ();
    # the result fields we're interested in
    my @fields = qw(URL title snippet cachedSize);

    # pointer to the sort routine that will be used to sort results
    # (numeric or alphabetical depending on which option user chose)
    my $sort_fn;
    if (($params->{'sort'} eq 'ranking') || ($params->{'sort'} eq 'proximity')) {
        $sort_fn = sub { $a <=> $b };    # numeric sort
    }
    else {                               # URL or title
        $sort_fn = sub { $a cmp $b };    # alpha sort
    }

    # loop from zero to "within N words" value,
    # constructing a wildcard search (or two) at each level.
    # eg. for
    #   fine within 2 words of dandy, in that order
    # we'd do the queries:
    #   "fine dandy"
    #   "fine * dandy"
    #   "fine * * dandy"
    # (the label lets us break out of both loops if a search returns an error)
    LEVEL:
    for my $level (0 .. $params->{'n'}) {
        my $wildcards = '* ' x $level;

        # two possible queries for each level: one with term1 first,
        # one with term2 first (if user chose "either order")
        my @queries = ('"' . "$params->{'term1'} $wildcards$params->{'term2'}" . '"');
        if ($params->{'order'} eq 'either') {
            # the reversed query must interpolate $params->{'term2'};
            # a missing sigil here would search for the literal text instead
            push @queries, '"' . "$params->{'term2'} $wildcards$params->{'term1'}" . '"';
        }

        my $t = 0;    # so we can tell whether we're at the forward (1) or reverse (2) query
        for my $query (@queries) {
            $t++;

            # append additional search terms
            $query .= ' ' . $params->{'additional'} if ($params->{'additional'});

            my $response = google_search($service, $google_key, $query, 0,
                $params->{'per_query'}, $params->{'filter'});

            # service fault (i.e. bad license key): bail out of loops
            if ($response->{'err'}) {
                $err = $response->{'err'};
                last LEVEL;
            }

            # track total number of queries we've done
            $params->{'count_queries'}++;

            # skip to next iteration if no results
            next unless (defined $response->{'resultElements'}
                && @{$response->{'resultElements'}});

            # accumulate total result count from all queries
            $params->{'count_total'} += $response->{'estimatedTotalResultsCount'};

            # rank is each result's position within this query's set of results
            my $rank = 0;

            # loop through results
            for my $result (@{$response->{'resultElements'}}) {
                $rank++;

                # for sorting: reverse-order result should be ranked
                # immediately after its counterpart at this proximity level
                my $sort_rank = ($t == 2) ? ($rank + 0.5) : $rank;

                # don't duplicate results we've already come across
                next if ($urls_seen{$result->{'URL'}});
                $urls_seen{$result->{'URL'}} = 1;

                $params->{'count_results'}++;
                $result->{'title'} = '[untitled]' unless ($result->{'title'});

                # now assign the sort key based on which sort option the user chose
                my $key = '';
                if ($params->{'sort'} eq 'ranking') {
                    $key = ($sort_rank * 100) + $level;
                }
                elsif ($params->{'sort'} eq 'proximity') {
                    $key = ($level * 100) + $sort_rank;
                }
                else {    # URL or title
                    # add rank to avoid duplicate keys
                    $key = $result->{$params->{'sort'}} . (($sort_rank * 100) + $level);
                }

                # stick the fields we need into the results hash
                @{$results{$key}}{@fields} = @{$result}{@fields};
                $results{$key}{'proximity'} = $level;

                # keep track of which query produced this result
                # so we can link to it from the listing
                $results{$key}{'query'} = $response->{'searchQuery'};
            }
        }
    }

    if ($err) {
        # we already have an error, so don't proceed
    }
    elsif (%results) {
        # user can specify a number of total results; only cap the count
        # when the limit is actually smaller than what we found, so the
        # header never claims more results than are listed
        if ($params->{'limit_total'}
            && ($params->{'limit_total'} < $params->{'count_results'}))
        {
            $params->{'count_results'} = $params->{'limit_total'};
        }

        # the blue bar above the results
        $html_result_head = html_result_head($params);

        my $shown = 0;
        # sort keys with sort function determined earlier
        for my $key (sort $sort_fn keys %results) {
            $shown++;
            last if ($shown > $params->{'count_results'});
            $html_output .= html_result($results{$key});
        }

        # duplicate search box below results
        $html_output .= html_searchbox($params);
    }
    else {    # no results
        # pluralize "word/words" properly
        my $word_s = ($params->{'n'} > 1) ? 'words' : 'word';
        $err = qq[No results found for "$params->{'orig_term1'}" within $params->{'n'} $word_s of "$params->{'orig_term2'}".];
    }

    $page_title = qq[GAPS: "$params->{'orig_term1'}" near "$params->{'orig_term2'}"];
}
else {    # we don't have two terms
    # 1 term entered (if neither term was entered, don't print an error)
    if ($params->{'term1'} || $params->{'term2'}) {
        $err = 'Please enter a term in each field.';
    }
    $page_title = "Google API Proximity Search";
}

# we're ready to output the page

# first, the page header (title, site logo, etc.)
# ---------------------------------------------------------------------------
# Page output and HTML helper subs.
#
# NOTE(review): the line structure of this section has been collapsed, so as
# stored the first '#' on each physical line comments out the rest of that
# line.  The heredoc bodies also appear to have lost their HTML markup:
# html_searchbox builds %options and $filter_checked but neither shows up in
# the visible form text, and the limit_total option string (qq{';) is
# unterminated.  Restore this section from a pristine copy of the script
# before relying on it; the code below is left byte-for-byte untouched.
#
# Output sequence (end of main):
#   html_page_header($page_title, 'Proximity Search (GAPS)',
#                    "$docs_path/readme.html")  - page header
#   html_searchbox($params)                     - the search form
#   $html_result_head                           - header above results (if any)
#   html_error($err)                            - only when $err is set
#   $html_output                                - the body of the results
#   a footer heredoc (its visible body is empty)
#
# html_error($err)
#   Returns the error message wrapped in a heredoc.
#
# html_result_head($params)
#   Returns the "Found X within N word(s) of Y" summary line.  Reads
#   orig_term1, orig_term2, n, additional, filter, count_results,
#   count_total and count_queries from $params.  Side effect: count_total
#   is overwritten in the caller's hash with the insert_commas() result.
#   NOTE(review): orig_term1/orig_term2/additional are interpolated as-is;
#   they look like raw form input, so confirm whether they need HTML
#   escaping before being printed.
#
# html_searchbox($params)
#   Returns the search form.  Builds per-dropdown option strings that mark
#   the current value as selected: n (1 .. $max_distance), order
#   (that/either), sort (title/URL/ranking/proximity), per_query (1 .. 10)
#   and limit_total (0, 10 .. 50), plus a "checked" flag when filter == 1.
#   Interpolates the return value of key_message(), which is not defined in
#   this part of the file (presumably ga_lib.pl - confirm).
#
# html_result($result)
#   Returns the listing for one result hash.  URI-escapes the query (after
#   utf8_to_latin) and the URL into local variables, converts title and
#   snippet with utf8_to_latin IN PLACE on $result, and prints title,
#   proximity, snippet, URL and cachedSize.
# ---------------------------------------------------------------------------
print html_page_header($page_title, 'Proximity Search (GAPS)', "$docs_path/readme.html"); # the search form; subroutine will re-assign any values that were already entered print html_searchbox($params); # header above results (if any) print $html_result_head; # error message print html_error($err) if ($err); # the body of the results print $html_output; # page footer print <<"EOH"; EOH ###################### # END MAIN, BEGIN SUBS ###################### sub html_error { # take an error message and format it my ($err) = @_; return <<"EOH";

$err EOH } sub html_result_head { # generate blue bar above results my ($params) = @_; my $word_s = ($params->{'n'} > 1) ? 'words' : 'word'; my $additional = $params->{'additional'}; if ($additional) { $additional = " (+ $additional)"; } my $filtered = ($params->{'filter'} == 1) ? ' (filtered)' : ''; $params->{'count_total'} = insert_commas($params->{'count_total'}); return <<"EOH";
Found $params->{'orig_term1'} within $params->{'n'} $word_s of $params->{'orig_term2'}$additional.   First $params->{'count_results'} results of about $params->{'count_total'} from $params->{'count_queries'} queries$filtered.
EOH } sub html_searchbox { # generate search form, restoring whatever values are in params hash my ($params) = @_; # dropdown select lists; we want to pre-select an item in each one # (either the item previously chosen or the default) my %options = (); # "N words of" for (1 .. $max_distance) { my $selected = ($params->{'n'} == $_) ? ' selected' : ''; $options{'n'} .= qq{\n}; } # order of terms for (qw(that either)) { my $selected = ($params->{'order'} eq $_) ? ' selected' : ''; $options{'order'} .= qq{\n}; } # sort by for (qw(title URL ranking proximity)) { my $selected = ($params->{'sort'} eq $_) ? ' selected' : ''; $options{'sort'} .= qq{\n}; } # max. results per query for (1 .. 10) { my $selected = ($params->{'per_query'} == $_) ? ' selected' : ''; $options{'per_query'} .= qq{}; } # max. total results for (0, 10, 20, 30, 40, 50) { my $selected = ($params->{'limit_total'} == $_) ? ' selected' : ''; $options{'limit_total'} .= qq{'; } # "filter each query" checkbox my $filter_checked = ($params->{'filter'} == 1) ? ' checked' : ''; return <<"EOH";

Find within word(s) of   
     Additional terms:
Show results, with up to from each query   Filter each query
License key (optional)    ${\&key_message()}
EOH } sub html_result { # format a search result my ($result) = @_; # handle foreign characters and non-URL characters my $query = uri_escape(utf8_to_latin($result->{'query'}, \%utf8_cache)); my $url = uri_escape($result->{'URL'}); for (qw(title snippet)) { $result->{$_} = utf8_to_latin($result->{$_}, \%utf8_cache); } return <<"EOH";

$result->{'title'} [distance: $result->{'proximity'}]
$result->{'snippet'}
$result->{'URL'} - $result->{'cachedSize'} - Cached - Similar pages
EOH }