#!/usr/bin/perl -w # OverCluster 0.1, Written October 17, 2004, revised November 11th # A tool for querying Overture's keyword tool and clustering results # Copyright 2004, Ethan Zuckerman # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA # 02111-1307, USA. my $USAGE = <<_usage_; Usage: OverCluster is a tool that automates querying of the Overture Keyword Suggestion tool (located at http://inventory.overture.com/d/searchinventory/suggestion/) OverCluster is called from the command line, accepting as its only argument a text file. Each line of the text file contains a search term that is passed to the Overture engine. OverCluster feeds all search terms to Overture. For each term, it returns the "secondary terms" Overture associates with that term. For instance, if you search for "Alabama", Overture will return "sweet home alabama", "university of alabama" and "alabama football" as secondary terms. OverCluster calculates and displays how common those secondary searches are in relationship to the primary term. It also clusters secondary terms, displaying the most popular secondary terms and the primary terms that invoke them. OverCluster produces an HTML file which is placed in a dated results directory. The parent directory for these results is set in a variable within the script, as is the parent directory for input files. If you are unexpectedly seeing this message, it is likely because you have invoked OverCluster without a source file, or with a source file the script does not have permission to read. Check your permissions and try again. _usage_ use strict; use LWP::UserAgent; use Date::Calc; ### system-specific variables ### ### change this to a directory where you have permission ### to create directories and files my $sourcedir = "/Users/ethan/"; my $destination = "/Users/ethan/results/"; ### # globals my @terms; # our array of primary terms, read from file my %sectermhash; # our hash of secondary terms, returned from the # site we're scraping my $browser; # our browser object handle my $target = "http://inventory.overture.com/d/searchinventory/suggestion/"; # the site we're pointing our scraper at my $depth = 40; # how many secondary terms per term be considered my $cluster = 4; # minimum cluster size (value of '4' will give # you clusters of 5 or larger) # Find today's date, format it my ( $s,$m,$h,$mday,$mon,$year,$wday,$yday,$isdst ) = localtime( time ); my $longdate = localtime( time ); my @months = ( '01','02','03','04','05','06','07','08','09','10','11','12' ); if( $mday < 10 ) { $mday = "0$mday" } $year+=1900; my $date = "$year$months[$mon]$mday"; # Die if we've got no arguments, or if someone is asking for help unless ($ARGV[0]) {die $USAGE;} if ($ARGV[0] eq "-h") {die $USAGE;} # Check for the existance of a directory for today, create one # if need be, create output file unless (-d "$destination$date") {mkdir "$destination$date", 0777}; open (OUT, ">$destination$date/overture$ARGV[0].html") or die $!; # subroutine, calls LWP to make a post request and return # responses as an unnamed array sub do_POST{ $browser = LWP::UserAgent->new() unless $browser; my $resp = $browser->post(@_); return ($resp->content, $resp->status_line, $resp->is_success, $resp) if wantarray; return unless $resp->is_success; return $resp->content; } # take our terms file, clean it up and push it into our @terms array open(TERMS, $sourcedir.$ARGV[0]) or die("could not open $ARGV[0] for reading\n $USAGE"); while(my $line = ) { # ditch any characters in the search string that are not word # characters or spaces. Trim leading and trailing spaces. $line =~ s/^\([\w|\s]+\)//; $line =~ s/^\s+|\s+$//g; push(@terms,lc($line)); } close(TERMS); # print out an HTML header set, and a table of search terms # linked to their results later on the page print OUT ''; print OUT "Term clusters from Overture, $months[$mon]".'/'."$mday".'/'."$year"; print OUT ''; print OUT 'Search Terms | Clusters
'; # Format the search terms in a table. Link them to results later in # the page. Set the $max value to limit the number of items you # get in each row. my $max = 7; my $count = 0; for my $term (@terms) { if ($count == 0) {print OUT '';} print OUT ''; $count++; if ($count > $max) {print OUT ''."\n"; $count = 0} } print OUT '
'."$term".'
'; # for every term in the array, print out that term, the total # searches for that term, and the secondary terms associated # as set in the global $depth variable. Counts for the secondary # terms are expressed as percentages of the main term result. for my $term (@terms) { print OUT '

'."$term:\t".''; # Use the subroutine to get the results from Overture my ($doc, $status, $success, $resp) = do_POST($target, ['term'=>"$term"]); # if we fail, sleep for 10 seconds and try again unless ($success) { sleep (10); my ($doc, $status, $success, $resp) = do_POST($target, ['term'=>"$term"]);} # split the resulting document into individual lines # using the \ \;(\d+?)\<\/td\>/){ $denom = $1; print OUT "$denom".'
';} } # if we're not already out of our depth, and if # there's a size=1> tag fragment # we've got a secondary term # Needless to say, this also can and will break. print OUT '

'; foreach my $line (@lines) { if (($line =~ /size\=1\>\ \;(\d+?)\<\/td\>/) && ($count < $depth)) { my $num = $1; my $ratio; # the unless statement avoids nastiness # should we fail to get a term match unless ($denom == 0) {$ratio = ($num/$denom);} else {$ratio = "N/A";} # this is an ugly way to round off to # two decimal places and 4 sig. digits $ratio = ($ratio * 10000); $ratio = int($ratio); $ratio = ($ratio/100); $line =~ /color\=\#000000\>(.+?)\<\/a\>/; my $secterm = $1; # Eliminate the main term from the secondary # term. Trim leading and trailing spaces. # Eliminate orphaned n's $secterm =~ s/$term//; $secterm =~ s/(^\s+|\s+$)//; $secterm =~ s/(^n\s|^an\s|^ian\s)//; # And here's where it gets ugly. The central # data structure for this program is a hash # of arrays. But the hash has multiple vals # per key. Perl is really good about this - # it just creates an array of those multiple # values. But that means you're dealing with # a hash of arrays of arrays. This evil # little statement creates an anonymous # array, containing the search term, the # number of results, and the ratio of # results to total results, and pushes the # reference to this array into the anon. # array referenced by %sectermhash, key # $secterm. push (@{$sectermhash{$secterm}},[$term, $ratio, $num]); # create a link to the primary and secondary search terms # on google. Would make more sense to do this on yahoo # but they prevent their urls from being called this way print OUT ''."$secterm:\t".''; printf OUT ("%0.2f", $ratio); print OUT '%'."\t $num".'
'; $count++;} } # my $line in @lines print OUT '
'."\n"; sleep (10); } #my $nation in @countries # Begin outputting the clusters section of the document print OUT ''; # sort the secondary hash. For every relevant cluster, print it # in table form, with a link to the appropriate anchor $max = 6; $count = 0; foreach my $term (sort {scalar(@{$sectermhash{$b}}) <=> scalar (@{$sectermhash{$a}})} keys %sectermhash) { if (scalar (@{$sectermhash{$term}}) > $cluster) { if ($count == 0) {print OUT '';} print OUT ''; $count++; if ($count > $max) {print OUT ''."\n"; $count = 0} }} print OUT '
'."$term".'
'; # sort the secondary term hash by the number of primary terms # that included it. scalar(@{$sectermhash{$a}}) gives you the # number of arrays associated with this key, each of which # contains an original $term, $ratio and $results foreach my $term (sort {scalar(@{$sectermhash{$b}}) <=> scalar (@{$sectermhash{$a}})} keys %sectermhash) { my $thiscount = (scalar @{$sectermhash{$term}}); if ($thiscount > $cluster) { print OUT "\n".'
'; print OUT '

'."$term\t$thiscount".'
'; # @{$sectermhash{$term}} is an array containing references # to each of the results arrays. We sort on the ratios in # each of those arrays. my @sorted = sort {$b->[1] <=> $a->[1]} @{$sectermhash{$term}}; # to sort by number of subsidiary results, use this: # my @sorted = sort {$b->[2] <=> $a->[2]} @{$sectermhash{$term}}; # to sort subsidiary results alphabetically, use this: # my @sorted = sort {$b->[0] cmp $a->[0]} @{$sectermhash{$term}}; foreach my $term (@sorted) { my $cleanterm = $term->[0]; my $percent = $term->[1]; my $results = $term->[2]; print OUT "$percent".'%
'."$cleanterm".''."\t$results".'
'}; }} exit();