#!/usr/bin/perl -wT # (C) Kim Holburn 2011, 2016 # released under GNU Public License http://www.gnu.org/copyleft/gpl.html # script to generate pseudo-random phrases # $version = "1.6 2016-07-11"; # $version = "1.5 2016-07-09"; # version 1.4 2016-07-05 # version 1.3 2016-06-29 # version 1.2 2016-06-25 # version 1.1 2011-09-22 # word frequency lists: # http://www.kilgarriff.co.uk/bnc-readme.html # http://www.kilgarriff.co.uk/BNClists/all.al.gz # I expect a text file with an entry per line # space separated fields with frequency number and word being # all I use. # I have removed a lot of entries using these following greps # grep -Pi "^\d+ [a-z-_']+ " all.al > alln2.al # only entries with a-z A-Z - _ ' # grep -Pvi " '[^ ']*' " alln2.al>alln3.al # remove entries in quotes # grep -v '[-]-' alln3.al > alln4.al # remove entries with -- # grep -v '^1 ' alln4.al > alln5.al # remove entries with a frequency of 1 use strict; #use POSIX qw(strftime); use File::Basename; my $mydirname = dirname($0); my $myname = basename($0); my ($mynamex,$mydir,$myext) = fileparse($0); my $version = "1.6 2016-07-11"; my $mymode = ''; my $freq=0; my $dic=1; my $wordy=0; my $dict = "/usr/share/dict/words"; my $dict1 = "words"; my $fdict = "all.al"; #fdict1 my $outputfile = "-"; my @dict = (); my @fdict = (); my @mytext = (); my $make = 0; my $good = 1; my $uniq=0; my $number=5; my $times=5; my $listy=0; my $added=''; my $verbose=0; my $lower=1; my $upper=0; my $wmin=2; my $wmax=9; my $wany=0; my $space=" "; my $nospace=""; my $pop=1; my $alt=0; my $caps=0; my $inword=0; my $myeol=1; my $full=0; sub fail_usage { my (@mess) = @_; my $name = $0; $name =~ s#^.*/(.+)$#$1#; if (scalar @mess) { for (@mess) { print STDERR "$name Error : $_ \n"; } exit 1; } print STDERR <] [-t ] [-v [-v] verbose ] [-u] [-a] [-m|--min 2] [-x|--max 7] [-s " _-.,"] randword has four modes: -d|--dictionary (default) Use dictionary mode -f|--frequency = use frequency weighted word list (more frequently used words are more likely to be chosen) -r|--wordy = word-like random ouput -M|--make = turn text files into a dictionary file that can be used by this program. -L|--list = list ford files I can see These options are for any word output mode: -t|--times = output number of times (lines) (default $times) -w|--words = output number of words (default $number) -s|--spaces = space (default: " ") -s "" for no spaces more than one space character will be used randomly between each word -S|--no-space = choose this char from the -s string as empty char It's hard to explain this option. add a letter like say 3 to the space list then add -S "3" and whenever the 3 gets chosen as the space, the program adds no space. -m|--min = min word length (default: $wmin) -x|--max = max word length (default: $wmax) -a|--any-length = words of any length Choosing this option means the last two options are ignored. -A|--alternate = alternate way of choosing words. Normal way of choosing words is to choose a word length from the range of word lengths with equal weight, then choose a word of that length. Alternate method is to create a list of only words of allowed length and then choose from that list. There are far fewer, two character words, just saying. -C|--capitalize = capitalize fist letter of each word. -N|--no-eol = send no end of line character at end -F|--file "/path/to/dictionary/or/directory/of/dictionaries" (you can add as many of these as you like) $name will look for dictionaries in /randlib/ or /randwords/ where is the directory that $name is in. -F [ words popularity: ignore words of frequency less than n default is $pop Default frequency file is "${mydirname}/randlib/all.al". Frequency files should have at least two fields per line, a word and a frequency number, in either order but the order must be the same throughout the file. The fields can be separated by space, tab or comma. There can be more than two fields but only the first two fields are looked at. These options are for wordy mode only: -i|--inword allow some in word punctuation (' -) default is off Make dictionary mode takes any number of files it has two additional possible options --bad - show rejected words instead (diagnostic) -O|--ouputdict it is used like this: $name -M -F [ ] (output to STDOUT) $name -M -F ./shakespeare.txt -O ./sh.words $name -M -F ./jane-austen.txt ./shakespeare.txt > ./ws-ja.words $name -M -F ./shakespeare.txt --bad -O ./sh.words List dictionary mode takes takes only one option -l|--list-full list full path -h|--help = show this help screen Defaults are: $name -F "/usr/share/dict/words" -w $number -t $times -s " " $name -f -F "${mydirname}/randlib/all.al" -w $number -t $times -s " " -P $pop default mode is dictionary mode default number of words output is -w $number default number of times output is -t $times default min letter in word is -m $wmin default max letter in word is -x $wmax default spaces is " " default nospace is "" (off) Examples: Main way of using: $name (some random words) $name -f (frequency weighted random words) $name -r (random preudo words) $name -w 3 (three random words) EOM exit 1; } sub print_v { my $v = shift; if ($verbose>=$v) { print STDERR join ("", @_); } } sub findit { my $file = shift; my @paths = ("","$mydirname/randlib/","${0}s/","$mydirname/${mynamex}s/"); my @exts = ("", ".words"); for my $pp (@paths) { for my $ee (@exts) { my $ff = "${pp}${file}$ee"; if (-e $ff) { return $ff; } } } return ""; } while ($ARGV=$ARGV[0]) { if ($ARGV eq "-w" or $ARGV eq "--words") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no number after -w option"); } $number=shift @ARGV; if ($number !~ /^[0-9]{1,}$/) { &fail_usage ("I don't understand ($number) as a number"); } if ($number <= 0) { &fail_usage ("word number must be greater than 0 (was $number)"); } } elsif ($ARGV eq "-t" or $ARGV eq "--times") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no times after -t option"); } $times=shift @ARGV; if ($times !~ /^[0-9]{1,}$/) { &fail_usage ("I don't understand ($times) as a number"); } if ($times <= 0) { &fail_usage ("times must be greater than 0 (was $times)"); } } elsif ($ARGV eq "-x" or $ARGV eq "--max") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no max after -x option"); } $wmax=shift @ARGV; if ($wmax !~ /^[0-9]{1,}$/) { &fail_usage ("I don't understand ($wmax) as a number"); } if ($wmax <= 0) { &fail_usage ("max must be greater than 0 (was $wmax)"); } } elsif ($ARGV eq "-m" or $ARGV eq "--min") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no min after -m option"); } $wmin=shift @ARGV; if ($wmin !~ /^[0-9]{1,}$/) { &fail_usage ("I don't understand ($wmin) as a number"); } if ($wmin <= 0) { &fail_usage ("min must be greater than 0 (was $wmin)"); } } elsif ($ARGV eq "-P" or $ARGV eq "--pop") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no min after -p option"); } $pop=shift @ARGV; if ($pop !~ /^[0-9]{1,}$/) { &fail_usage ("I don't understand ($pop) as a number"); } if ($pop <= 0) { &fail_usage ("pop must be greater than 0 (was $pop)"); } } elsif ($ARGV eq "-s" or $ARGV eq "--spaces") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no string after -s option"); } $space=shift @ARGV; $space =~ s/[^[:print:]]//g; # only printable chars; } elsif ($ARGV eq "-S" or $ARGV eq "--no-space") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no string after -S option"); } $nospace=shift @ARGV; $nospace =~ s/[^[:print:]]//g; # only printable chars $nospace =~ s/^(.).*/$1/g; # only one char } elsif ($ARGV eq "-O" or $ARGV eq "--output-dict") { shift @ARGV; if ($#ARGV < 0) { &fail_usage ("no string after -O option"); } $outputfile=shift @ARGV; $outputfile =~ s/[^[:print:]]//g; # only printable chars if (-e $outputfile) { fail_usage ("dictionary output file exists:($outputfile)"); } } elsif ($ARGV eq "-F" or $ARGV eq "--file" ) { my $myopt = $ARGV; my @myfiles = (); shift @ARGV; if (scalar @ARGV < 1) { fail_usage (" $myopt option must be followed by filename"); } while (my $file = $ARGV[0]) { # we hit an option, finish up if ($file =~ /^-/) { last; } if (! -e $file) { my $filet = findit ($file); # if (-e "$mydirname/randlib/$file") { $file = "$mydirname/randlib/$file"; } # elsif (-e "${0}s/$file") { $file = "${0}s/$file"; } # elsif (-e "$mydirname/${mynamex}s/$file") { $file = "${mynamex}s/$file"; } #my ($mynamex,$mydir,$myext) = fileparse($0); # else if (!$filet) { fail_usage ("Can't find user supplied ($file)"); } $file = $filet; } if (! -r $file) { fail_usage ("Can't read user supplied ($file)"); } if (-d $file) { # directory opendir my($dh), $file or die "Couldn't open dir '$file': $!"; my @files = grep { (! -l $_) && -r _ && -f _ } map { $_ = "$file/$_"; } readdir $dh; # read actual files but not directories closedir $dh; if (!scalar @files) { fail_usage ("Couldn't find files to read in directory ($file)"); } push (@myfiles, @files); } else { push (@myfiles, $file); } shift @ARGV; } if (scalar @myfiles < 1) { fail_usage (" $myopt option with no filename following"); } else { push (@dict, @myfiles); } } elsif ($ARGV eq "-L" or $ARGV eq "--list") { $mymode = 'L'; shift @ARGV; } elsif ($ARGV eq "-l" or $ARGV eq "--list-full") { $full = 1; shift @ARGV; } elsif ($ARGV eq "-M" or $ARGV eq "--make") { $mymode = 'M'; shift @ARGV; } elsif ($ARGV eq "-r" or $ARGV eq "--wordy") { $mymode = 'r'; shift @ARGV; } elsif ($ARGV eq "-i" or $ARGV eq "--inword") {$inword=1; shift @ARGV; } elsif ($ARGV eq "-f" or $ARGV eq "--frequency") { $mymode = 'f'; shift @ARGV; } elsif ($ARGV eq "-A" or $ARGV eq "--alternate") { $alt=1; shift @ARGV; } elsif ($ARGV eq "-N" or $ARGV eq "--no-eol") { $myeol=0; shift @ARGV; } elsif ($ARGV eq "-C" or $ARGV eq "--capitalize") { $caps=1; shift @ARGV; } elsif ($ARGV eq "-d" or $ARGV eq "--dictionary") { $mymode = 'd'; shift @ARGV; } elsif ($ARGV eq "-u" or $ARGV eq "--unique") { $uniq=1; shift @ARGV; } elsif ($ARGV eq "-a" or $ARGV eq "--any-length") { $wany=1; shift @ARGV; } elsif ($ARGV eq "-h" || $ARGV eq "--help") { &fail_usage (); } elsif ($ARGV eq "-v" or $ARGV eq "--verbose") { $verbose++; shift @ARGV; } elsif ($ARGV eq "--bad") { $good=0; shift @ARGV; } elsif ($ARGV eq "-V" or $ARGV eq "--version") { print ("$myname version $version\n"); exit 0; } elsif ($ARGV =~ /^-/) { &fail_usage ("unknown option \"$ARGV\""); } else { last; } } if (scalar @ARGV > 0) { &fail_usage ("extra arguments"); } if ($wmin > $wmax) { &fail_usage ("min word length must be less than or equal to max ($wmin > $wmax)"); } if (length $nospace) { if (length $nospace > 1) { fail_usage ("no space char can only be 1 character"); } if ( $space !~ /$nospace/) { fail_usage ("no space char not found in space char string"); } } if ($mymode eq '') { $mymode = 'd'; } #if ($mymode eq 'd') { $dic=1; $freq=0; $wordy=0; $make=0; } #elsif ($mymode eq 'f') { $dic=0; $freq=1; $wordy=0; $make=0; } #elsif ($mymode eq 'r') { $dic=0; $freq=0; $wordy=1; $make=0; } #elsif ($mymode eq 'M') { $dic=0; $freq=0; $wordy=0; $make=1; } if ($mymode eq 'f') { if (scalar @dict == 0) { my $filet = findit($fdict); if (!$filet or ! -e $filet) { fail_usage ("Can't find dictionary ($fdict)"); } if (! -r $filet) { fail_usage ("Can't read dictionary ($fdict)"); } push (@dict, $filet); } } elsif ($mymode eq 'd') { if (scalar @dict == 0) { if (! -e $dict) { $dict = findit($dict1); } if (! -e $dict) { fail_usage ("Can't find dictionary ($dict) or ($dict1)"); } if (! -r $dict) { fail_usage ("Can't read dictionary ($dict) or ($dict1)"); } push (@dict, $dict); } } elsif ($mymode eq 'M') { if (scalar @dict == 0) { fail_usage ("Can't find files to make a dictionary from"); } } if ($wany) { $wmax = 24; $wmin = 1; } print_v 1, ( "# " , "-$mymode ", "-w=$number ", "-m=$wmin ", "-x=$wmax ", $wany?"-a ":"", $uniq?"-u ":"", $alt?"-A ":"", $caps?"-C ":"", $myeol?"":"-N ", "-t=$times ", "-s=\"$space\" ", "-S=\"$nospace\" ", $good?"":"--bad", "-P=$pop \n# ", $full?"-l ":"", scalar @dict?" -F ".join(",",@dict)." \n# ":"", $mymode eq 'M'?"-O ($outputfile) \n# ":"", " options not used:\n# ", $wany?"":"(-a) ", $uniq?"":"(-u) ", $alt?"":"(-A) ", $caps?"":"(-C) ", $myeol?"(-N) ":"", $good?"(--bad) ":"", $full?"":"(-l) ", "\n" ); sub spacer () { my $spaced = ""; if (length($space) <= 0) { return $spaced; } if (length($space) == 1) { $spaced = $space; } else { $spaced = substr ($space, int(rand(length($space))), 1); } if ($spaced eq $nospace) { $spaced = ""; } return $spaced; } sub test_format ($\@) { my $mydict = shift; my $flines = shift; # test file format here my $count = 0; # if we can't tell after 1000 lines we give up my $style = -1; # style: # -1=IDK # 0=single word ('a') # 1 = word then number ('a 1' or 'a 1 xx') # 2 = number then word ('1 a' or '1 a xx') # 3 = word then other stuff ('a xxx') for my $line (@$flines) { # line is single word - '\s\t,.'? if ($line !~ /[^a-z0-9_'-]/i) { $style = 0; last; } elsif ($line =~ /^\d+[\s,\.]+\d+/) { ; } # inconclusive (number could be a word) elsif ($line =~ /^[^\s,\.]+[\s,\.]+\d+/) { $style = 1; last; } # freq 2nd elsif ($line =~ /^\d+[\s,\.]+[^\s,\.]+/) { $style = 2; last; } # freq 1st elsif ($line =~ /^[^\s,\.]+[\s,\.]+[^\s,\.]+/) { $style = 3; last; } # just words, use first else { &fail_usage ("I don't understand the format of file ($mydict)"); } if ($count > 1000) { &fail_usage ("The format of file ($mydict)? Nah, all I got is 2 numbers."); } $count ++; } print_v 2, "# debug ($mydict) style=($style)\n"; if ($style == -1) { &fail_usage ("No, I don't understand the format of file ($mydict)"); } return $style; } my %fwords = (); my @words = (); my $total = 0; sub read_files () { # my @dict = @_; # my @lines=(); foreach my $mydict (@dict) { my @flines = (); if (-f $mydict) { { # slurping the file to a scalar speeds the read by 3 times local $/ = undef; open( my $fh, "<", $mydict ) or die "Can't open $mydict: $!"; my $fline = <$fh>; close $fh; @flines = split (/\s*[\r\n]+\s*/,$fline); } print_v 2, "# debug read file ($mydict) (",scalar @flines,")\n"; # trim and chomp etc. #map {s/^\s+//; s/\s+$//;} @flines; @flines = grep {/\S/} @flines; # get rid of empties # test format here my $style = test_format($mydict, @flines); print_v 1, "# debug read file ($mydict) l(",scalar @flines ,") style=($style)\n"; if ($mymode eq 'd') { if ($style == 0) { push @words, @flines; } elsif ($style == 2) #{ push @words, map { my ($b,$a) = split (/[\s,\.]+/, $_, 3); $a; } @flines; } { map { if (/^\d+[\s,\.]+([^\s,\.]+)\b/) { push @words, $1; } } @flines; } else # style == 1 or 3 #{ push @words, map { my ($a) = split (/[\s,\.]+/, $_, 2); $a; } @flines; } { map { if (/^([^\s,\.]+)/) { push @words, $1; } } @flines; } print_v 2, "# debug -d ($mydict) (",scalar @words,")\n"; } else { # freq if ($style == 0 or $style == 3) { if ($mymode eq 'f') { &fail_usage ("file ($mydict) is a simple dictionary but you want a frequency list"); } } my $re; if ($style == 1) { $re = qr/^(?P[^\s,\.\d]+)[\s,\.]+(?P\d+)/; } else { $re = qr/^(?P\d+)[\s,\.]+(?P[^\s,\.\d]+)/; } # ($style == 2) #map { # ok add these words # my ($f1, $f2) = split (/[\s,\.]+/, $_, 3) ; # if ($style == 1) { $fwords{$f1} += $f2; } # else { $fwords{$f2} += $f1; } # } @flines; map { if (/$re/) { $fwords{$+{word}} += $+{freq} ; $total += $+{freq}; } } @flines; print_v 2, "# debug freqfile ($mydict) (",scalar keys %fwords,")\n"; } } } } if ($mymode eq 'd') { &read_files(); # my @words = split /[\s\n\r]+/, &read_files (@dict); # 'a', '1 a', 'a 1', '1 a xxx', 'a 1 xxx' my $wordn = scalar @words; if ($uniq) { my @uniqs; my %seen = (); @uniqs = grep { ! $seen{$_} ++ } @words; # remove duplicates @words = @uniqs; } my $un = scalar @words; if ($alt && ! $wany) { my @alt = grep { my $len= length; ($wmin <= $len) and ($len <= $wmax) } @words ; @words = @alt; $wany = 1; } my $an = scalar @words; print_v 1, "# words=($wordn), unique=($un), in range=($an)\n"; my $dist = $wmax - $wmin; print_v 1, "# dist = ($dist)\n"; my @list; my @nlist; for (1..$times) { my $out = ""; if ($wany) { for (1..$number) { if (length ($out) > 0) { $out .= spacer(); } my $ww = $words[int(rand(scalar @words))]; if ($caps) { $ww = ucfirst($ww); } $out .= $ww; } } else { for (1..$number) { my $wordl = $wmin; if ($dist) { $wordl += int(rand($dist)) } if (!defined $list[$wordl]) { my @lwords = grep { length == $wordl } @words; $list[$wordl] = \@lwords; $nlist[$wordl] = scalar @lwords; } my $lwords = $list[$wordl]; #my $rnlwords = scalar @$lwords; my $rnlwords = $nlist[$wordl]; if (1<$verbose) { print STDERR "# word len = ($wordl) words = ($rnlwords) \n"; } if (length ($out) > 0) { $out .= spacer(); } my $ww = $$lwords[int(rand($rnlwords))]; if ($caps) { $ww = ucfirst($ww); } $out .= $ww; } } print_v 2, "# $out\n"; print $out; if ($myeol) { print "\n"; } # print "$out\n"; } } elsif ($mymode eq 'f') { # frequency distributions # my $total = 0; sub weighted_rand (\%$) { my $dists = shift; my $total = shift; my ($key, $weight); my $rand; while (1) { # to avoid floating point inaccuracies $rand = rand ($total); while (($key, $weight) = each %$dists ) { # { return $key if ($rand -= $weight) < 0; } if (($rand -= $weight) < 0) { if ($caps) { return ucfirst($key); } else { return $key; } } } } } &read_files (); print_v 2, "# entries = (" . scalar (keys %fwords) . ") \n"; if ($alt && ! $wany) { $total = 0; while (my ($key, $value) = each (%fwords)) { if ($value < $pop) { delete $fwords{$key}; } else { my $len = length($fwords{$key}); if ($len < $wmin or $wmax < $len) { delete $fwords{$key}; } else { $total += $value; } } } $wany = 1; } else { if (1 < $pop) { $total = 0; while (my ($key, $value) = each (%fwords)) { if ($value < $pop) { delete $fwords{$key}; } else { $total += $value; } } } } print_v 1, "# entries = (" . scalar (keys %fwords) . "), total=($total)\n"; my $dist = $wmax - $wmin; print_v 1, "# dist = ($dist)\n"; my @list; my @tots; for (1..$times) { my $out = ""; if ($wany) { for (1..$number) { if (length ($out) > 0) { $out .= spacer(); } $out .= weighted_rand (%fwords, $total); } } else { for (1..$number) { my $wordl = $wmin; if ($dist) { $wordl += int(rand($dist)) } if (!defined $list[$wordl]) { my %lwords = () ; my $totl = 0; map { if (length == $wordl) { $lwords{$_} += $fwords{$_}; $totl += $fwords{$_} } } (keys %fwords); $list[$wordl] = \%lwords; $tots[$wordl] = $totl; print_v 2, "# wordlist[$wordl]=[", scalar keys %lwords,"] tot=$totl \n"; } my $lwords = $list[$wordl]; my $rnlwords = scalar keys %$lwords; my $totl = $tots[$wordl]; #print_v 2, "# word len = ($wordl) words = ($rnlwords) \n"; if (1<$verbose) { print STDERR "# word len = ($wordl) words = ($rnlwords) \n"; } if (length ($out) > 0) { $out .= spacer(); } $out .= weighted_rand (%$lwords,$totl); } } print_v 2, "# $out\n"; print $out; if ($myeol) { print "\n"; } # print "$out\n"; } } elsif ($mymode eq 'r') { # wordy sub lett { my @ch = @_; my $ind = int(rand(scalar @ch)); # if ($verbose) { print "debug \n";} print_v 2, " # debug ch=(",join ('', @ch),") [$ind]=\"$ch[$ind]\"\n"; $ch[$ind]; } my @vowels=qw(a e i o u y); my @consonants=qw(b c d f g h j k l m n p q r s t v w x y z); # my @v; # my @c; # push @v, @vowels; # push @c, @consonants; my @p = qw(' -); if (!$inword) { @p = (); } sub wordy { my $number = shift; my $word = ""; # if ($verbose) { print "# c = $times \n"; } my ($v, $c) = ('v', 'c'); my $didpunc=0; print_v 2, " punct=(", join ('', @p), ") \n"; my $punc=$number*2 + 5; my $t = (int(rand(3)))?$c:$v; for my $i (1..$number) { my $ch = lett (($t eq 's')?@p:(($t eq 'v')?@vowels:@consonants)); $word .= $ch; print_v 2, " # i=($i) t=($t) chars=(", scalar @p, ") v=($v) c=($c) ch($ch)\n"; if ($t eq 'v') { ($v, $c) = ('v', 'c'); } else { ($c, $v) = ('v', 'c'); } if ($t eq 's') { $punc = 1000; $didpunc++; } $t = (!(scalar @p) || ($i>=($number-1)) || int(rand($punc)))? ((int(rand(3)))?$c:$v):'s'; } print_v 2,"$word "; if ($caps) { $word = ucfirst ($word); } return $word; } my $dist = $wmax - $wmin; print_v 1, "# dist = ($dist)\n"; for (1..$times) { my $out=""; for (1..$number) { my $wordl = $wmin; if ($dist) { $wordl += int(rand($dist)) } if (length ($out) > 0) { $out .= spacer(); } $out .= wordy($wordl); } print $out; if ($myeol) { print "\n"; } } } elsif ($mymode eq 'M') { # makedict mode my $out = ""; if ($outputfile eq '-') { $out = '-'; } else { # untaint filename if ($outputfile =~ m#^([a-z0-9~'"_+/\.-]+)$#i) { $out = $1; } else { &fail_usage ("output file ($outputfile) invalid characters in filename"); } if ( -e $out ) { &fail_usage ("output file ($out) exists"); } } print_v 1, ( "# making text files (", join("'",@dict), ") \n", "# into a dictionary to ($out)\n", ); my %words = (); # if ($good) { # map {$words{lc($_)}++} grep { /^[a-z0-9'-]+$/i and not /''|--/ } # split(/[^a-z0-9]*[\s\t\n\r]+[^a-z0-9]*/i, " ". # join ('\n', map { local (@ARGV, $/) = $_; <> } @mytext)." "); # } if ($good) { map {$words{lc($_)}++} grep { !/^[\d'-]+$/ and /^[a-z0-9'-]+$/i } split(/[^a-z0-9]*(?:[\s\t\n\r]+|--+|[,;\.!\?\(\)\{\}\[\]]+)[^a-z0-9]*/i, " ". join ('\n', map { local (@ARGV, $/) = $_; <> } @dict )." " ); } else { map {$words{lc($_)}++} grep { /^[\d+'-]$/ or !/^[a-z0-9'-]+$/i } split(/[^a-z0-9]*(?:[\s\t\n\r]+|--+|[,;\.!\?\(\)\{\}\[\]]+)[^a-z0-9]*/i, " ". join ('\n', map { local (@ARGV, $/) = $_; <> } @dict )." " ); } #my @words = sort { "\U$a" cmp "\U$b" } grep { $words{$_} > 1} (keys %words); my @words = sort { $words{$b} <=> $words{$a} || $a cmp $b } (keys %words); #my @words = sort (keys %words); my $fh; if ($good and $out ne '-') { open( my $fh, ">", $out ) or die "Can't write to $out $!"; select ($fh); } map { print "$_ $words{$_}\n" } @words; if ($fh) { close $fh; } } else { # list mode my @paths = ("$mydirname/randlib", "$mydirname/${mynamex}s"); if ("${0}s" ne "$mydirname/${mynamex}s") { push @paths, "${0}s"; } my @exts = ("", ".words"); my $found = 0; my $founddir = 0; for my $dir (@paths) { if (-d $dir) { # directory $founddir ++; opendir my($dh), $dir or die "Couldn't open dir '$dir': $!"; my @files = grep { (! -l $_) && -r _ && -f _ } map { $_ = "$dir/$_"; } readdir $dh; # read actual files but not directories closedir $dh; $found += scalar @files; my @simple = (); if (!$full) { @simple = map { my $x = $_; $x=~s{^$dir/}{}x; $x; } @files; } print "$dir/:\n"; if (scalar @files) { my $joiner = $full?"\n ":" "; print " ", join ($joiner, $full?@files:@simple), "\n"; } } } }