#!/usr/bin/perl
#
# keywords - scan a plain-text or PDF document for a list of keywords and
# write every hit (page, word index, keyword, sentence) to a CSV file.
# The user manual is the POD section after __END__.

use 5.014;    # s///r and the /a regex flag both require Perl 5.14
use warnings;

use Getopt::Long;
use Pod::Usage;
use Text::CSV;

# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------

my $out;
my $verb = 1;    # 1 = quiet, 2 = summary lines, 3 = progress messages
my $help = 0;
GetOptions(
    'output|o=s' => \$out,
    'verbose|v+' => \$verb,
    'help|h|?'   => \$help,
) or pod2usage(-exitstatus => 2);

# The verbosity doubles as the pod2usage level, so "--help --verbose"
# prints the whole manual instead of just synopsis and options.
pod2usage(-exitstatus => 0, -verbose => $verb) if $help;

my ($key, $doc) = @ARGV;
pod2usage(-exitstatus => 2) unless (defined $key and defined $doc);

# Default output name: replace the document's extension by ".csv".  The
# extension must not contain a "/", otherwise a dot in a directory name
# (or a leading "./") would swallow the rest of the path.
$out //= $doc =~ s{(?:\.[^./]+)?\z}{.csv}r;

# Unbuffered STDOUT so that progress output appears immediately.
$|++ if ($verb > 1);

# ---------------------------------------------------------------------------
# Keyword list
# ---------------------------------------------------------------------------

say "Reading keywords from '$key' ..." if ($verb > 2);

# Set of distinct keywords.  Input is handled as raw bytes; the /a flag
# keeps \s limited to ASCII whitespace so that the bytes 0x85 and 0xA0,
# which occur inside UTF-8 encoded characters, are not taken for separators.
my %is_keyword;
{
    open my $in, '<', $key or die "Error opening keyword file: $!";
    while (my $line = <$in>) {
        chomp $line;
        for my $keyword (split /[\s.:!?,;()]+/a, $line) {
            # A line that starts with a separator produces a leading
            # empty field, which is not a keyword.
            next if $keyword eq '';
            $is_keyword{$keyword} = 1;
        }
    }
    close $in;
}
say scalar(keys %is_keyword), " keywords read" if ($verb > 1);

# ---------------------------------------------------------------------------
# Document source
# ---------------------------------------------------------------------------

say "Scanning document '$doc', writing output to '$out' ..." if ($verb > 2);

# A document is considered a PDF if it starts with the magic bytes "%PDF".
my $ispdf = do {
    open my $in, '<', $doc or die "Error opening document file: $!";
    my $magic = '';
    defined(read $in, $magic, 4)
        or die "Error reading document file: $!";
    $magic eq '%PDF';
};

my $src;
if ($ispdf) {
    say "Document seems to be a PDF file" if ($verb > 2);
    # List form of pipe open: no shell is involved, so the filename
    # needs no quoting.
    open $src, '-|', 'pdftotext', $doc, '-'
        or die "Error opening document stream: $!";
}
else {
    say "Document does not seem to be a PDF file" if ($verb > 2);
    open $src, '<', $doc or die "Error opening document file: $!";
}

# ---------------------------------------------------------------------------
# Scan
# ---------------------------------------------------------------------------

open my $tgt, '>', $out or die "Error opening output file: $!";
my $csv = Text::CSV->new({binary => 1, eol => $/});
$csv->print($tgt, [qw(Page Word Keyword Sentence)]);

my $page     = 0;     # number of form feeds seen so far
my $word     = 0;     # index of the next word on the current page
my $sentence = '';    # words and closing separator of the current sentence
my @hits     = ();    # [page, word, keyword] triples awaiting their sentence
my $total    = 0;     # number of rows written

# Write all pending hits together with the sentence collected so far and
# start a new sentence.
my $flush_sentence = sub {
    for my $hit (@hits) {
        $csv->print($tgt, [@$hit, $sentence]);
    }
    $total += @hits;
    $sentence = '';
    @hits     = ();
};

while (my $line = <$src>) {
    chomp $line;

    # Consume the line token by token from the front.
    while ($line ne '') {
        if ($line =~ s/^\f//) {
            # Form feed: a new page begins, word numbering restarts.
            $page += 1;
            $word = 0;
        }
        elsif ($line =~ s/^([^\s.:!?,;()]+)//a) {
            my $candidate = $1;
            if ($is_keyword{$candidate}) {
                print "$page,$word ... \n" if ($verb > 2);
                push @hits, [$page, $word, $candidate];
            }
            $sentence .= ' ' if ($sentence ne '');
            $sentence .= $candidate;
            $word += 1;
        }
        elsif ($line =~ s/^([.:!?,;()])//) {
            # A separator character closes the current sentence.
            $sentence .= $1;
            $flush_sentence->();
        }
        else {
            # Only whitespace can be left at the front here.
            $line =~ s/^\s+//a;
        }
    }
}

# The document may end without a closing separator; do not lose the hits
# of that last, unterminated sentence.
$flush_sentence->() if @hits;

say "Done" if ($verb > 2);
say "$total matches found" if ($verb > 1);

# When a piped child exits with a non-zero status, close fails with $! set
# to 0; report the wait status in that case instead of an empty message.
close $src
    or die($!
        ? "Failed to close document stream: $!"
        : "Failed to close document stream: wait status $?");
close $tgt or die "Failed to close output stream: $!";

__END__

=head1 NAME

keywords - Find keywords in PDF or text files

=head1 SYNOPSIS

keywords [OPTION ...] KEYWORDS DOCUMENT

=head1 OPTIONS

=over 4

=item B<--output=FILE>

=item B<-o FILE>

Write output to the given file. If no such option is given, the output
filename is constructed by replacing the extension of the input document
by C<.csv>.

=item B<--verbose>

=item B<-v>

Increases the verbosity of program output. Up to two instances of this
option currently make sense.

=item B<--help>

=item B<-h>

=item B<-?>

Shows documentation about the program. Combine with B<--verbose> to view
the entire manual page.

=back

=head1 DESCRIPTION

This program reads a list of keywords from a file and scans another file
for occurrences of those keywords. Both the keyword and document file are
split into words separated by whitespace or any of the sentence separator
characters C<.:!?,;()>.

If the document file is not plain text but a PDF file, it is automatically
filtered through the program C<pdftotext> and the output is scanned
instead.

While scanning the document, each occurrence of a keyword is printed to
the output in CSV format. The fields printed are

=over 4

=item the current page number, determined by counting form feeds;

=item the number of the word counting from the start of the page;

=item the matched keyword and

=item the sentence in which the keyword occurred.

=back

=head1 LICENSE

Copyright (c) 2013 by Thomas Chust

This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your
option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see L<http://www.gnu.org/licenses/>.

=cut