#!/usr/bin/perl

# configure
use constant FILE => 'walden.txt';
use constant STOP => 'stopwords.inc';

# include
use strict;
require 'subroutines.pl';

# Overview: read the words of FILE, tally single words and adjacent word
# pairs (bi-grams), then print the bi-grams from most to least frequent,
# skipping every pair in which either token contains punctuation or is
# listed as a stopword.

# parse; list_words is presumably supplied by subroutines.pl and is used
# here as an ordered, flat list of tokens
my @words = list_words( FILE );

# count words (this tally is not consulted by the report printed below)
my %word_count = ();
$word_count{ $_ }++ for @words;

# construct bi-grams; every word is joined to its successor with a single
# space, so the last word starts no pair and fewer than two words yield none
my @bigrams = map { $words[ $_ ] . ' ' . $words[ $_ + 1 ] } 0 .. $#words - 1;

# count bi-grams
my %bigram_count = ();
$bigram_count{ $_ }++ for @bigrams;

# display (sans stopwords, etc.)
# slurp_words (presumably also from subroutines.pl) is dereferenced below as
# a hash whose keys are the words to suppress
my $stopwords   = slurp_words( STOP );
my $punctuation = qr/[,.?!:;()\-]/;

# most frequent first; equal counts fall back to alphabetical order so the
# listing is the same on every run instead of following hash iteration order
my @ranked = sort { $bigram_count{ $b } <=> $bigram_count{ $a } || $a cmp $b } keys %bigram_count;

foreach my $bigram ( @ranked ) {

	# recover the two tokens that were joined with a space above
	my ( $first_token, $second_token ) = split / /, $bigram;

	# remove stopwords, etc.
	next if ( $first_token  =~ $punctuation or $stopwords->{ $first_token } );
	next if ( $second_token =~ $punctuation or $stopwords->{ $second_token } );

	print "$bigram_count{ $bigram } $bigram \n";

}