#!/usr/bin/perl -w
use strict;
use constant DEBUG => 0;
use constant MAX_KEY_SIZE => 64;
# Convert all alpha characters in string to bytes in [0..25] range, and
# remove all other characters.
sub ConvertInput($)
{
my ($data) = @_;
$data = lc $data;
$data =~ y/a-z//cd;
$data =~ y/a-z/\0-\31/;
return $data;
}
# Compute character frequency distribution from byte array
sub GetDistribution($$)
{
my ($data, $dist) = @_;
if( DEBUG )
{
die unless defined $data;
die unless length($data) > 0;
die unless defined $dist;
}
# Get frequencies
my @freq = ();
$freq[$_] = 0 foreach (0..25);
$freq[$_]++ foreach unpack 'C*', $data;
# Convert frequencies to probabilities
@$dist = map {$_ / length($data)} @freq;
}
# Split string into array of bytes by stride
sub SplitToStrides($$$)
{
my ($input, $stride, $output) = @_;
if( DEBUG )
{
die unless defined $input;
die unless length($input) > 0;
die unless defined $output;
}
$#$output = $stride - 1;
my $index = 0;
foreach my $c (unpack 'C*', $input)
{
$$output[$index++] .= chr($c);
$index %= $stride;
}
}
# Find best shift offset for a single distribution.
#
# Returns (offset, confidence) pair, where "confidence" is some
# measure of how likely this offset is correct, higher is better.
sub GetShiftOffset($$)
{
my ($dist, $input) = @_;
if( DEBUG )
{
die unless defined $input;
die unless length($input) > 0;
die unless defined $dist;
}
my @input_dist;
GetDistribution($input, \@input_dist);
my @offsets = ();
for(my $o = 0; $o < 26; $o++)
{
# Apply shift offset and compute error amount
my $error = 0;
for(my $i = 0; $i < 26; $i++)
{
my $diff = $$dist[$i] - $input_dist[($i + $o) % 26];
$error += $diff * $diff;
}
# Record (error, shift offset) pair
push @offsets, [$error, $o];
}
# Sort offsets by error amount
@offsets = sort {$$a[0] <=> $$b[0]} @offsets;
# If we guessed the right key size, the best fitting shift amount
# usually results in significantly less error than the next best
# fit. This is due to how most people choose keys with distinct
# characters, such when an incorrect key size is used, each stride
# contains samples of characters that are shifted by different
# amounts instead of a single shift amount, and the resulting mix
# of character frequencies means no single shift amount is
# significantly better than other shift amounts.
#
# Thus, this scheme doesn't work well when people use keys with
# repeated characters, such as "banana".
#
# This difference in error amount is returned in units of standard
# deviation, which the caller will make use of after having
# collected all key candidates.
my $average_error = 0;
foreach my $p (@offsets)
{
$average_error += $$p[0];
}
$average_error /= 26;
my $error_deviation = 0;
foreach my $p (@offsets)
{
my $delta = $$p[0] - $average_error;
$error_deviation += $delta * $delta;
}
$error_deviation = sqrt($error_deviation / 26);
# If error_deviation is close to zero, it means the characters were
# pretty much uniformly distributed, and it's not possible to crack.
if( $error_deviation < 1e-6 )
{
return (0, 0);
}
my $margin = $offsets[1][0] - $offsets[0][0];
# Compute confidence in log units of standard deviation. Anything
# in the same threshold bucket seem equally good or bad:
# - 1 < confidence
# - 0.5 < confidence <= 1
# - 0.25 < confidence <= 0.5
# - confidence <= 0.25
#
# We bucketize these confidence values since they aren't really
# that precise, and if there are two key candidates with comparable
# confidence, we want to keep both of them.
my $confidence = $margin / $error_deviation;
$confidence = $confidence < 0.25 ? 0 :
$confidence < 0.5 ? 1 :
$confidence < 1 ? 2 : 3;
return ($offsets[0][1], $confidence);
}
# Find the best key that would minimize error for a particular key length.
# Returns (key, confidence), where confidence is a value that indicates the
# likelihood of this key being the right one, higher is better.
#
# If no key with confidence above $threshold is found, return (undef, 0)
sub FindKeyWithLength($$$$)
{
my ($dist, $input, $key_size, $threshold) = @_;
# Split input to strides by key size. Because key size is much
# smaller than input, this guarantees that each stride will always
# have a few characters in it.
my @strides;
SplitToStrides($input, $key_size, \@strides);
# Get candidate shift amounts for each stride
my $key = "";
my $min_confidence = undef;
for(my $i = 0; $i < $key_size; $i++)
{
my ($offset, $confidence) = GetShiftOffset($dist, $strides[$i]);
$key .= chr($offset);
if( !defined($min_confidence) || $min_confidence > $confidence )
{
$min_confidence = $confidence;
if( $min_confidence < $threshold )
{
return (undef, 0);
}
}
}
return ($key, $min_confidence);
}
# Convert shift amounts to lowercase key
sub ConvertToLowercaseKey($)
{
my ($key) = @_;
return join '', map {chr(ord('a') + (26 - $_) % 26)} unpack 'C*', $key;
}
# Convert shift amounts to uppercase key
sub ConvertToUppercaseKey($)
{
my ($key) = @_;
return join '', map {chr(ord('A') + $_)} unpack 'C*', $key;
}
# Find candidate key in string of bytes. Output possible keys to stdout.
sub Crack($$)
{
my ($dist, $input) = @_;
# Try key lengths up to 1/5 of original input size. This is done
# by simply searching through all key sizes linearly.
#
# A more efficient way to do this is to try common divisors for all
# offsets of repeated strings, but that doesn't work so well with
# short input strings and we end up having to try all sizes
# linearly anyways.
my $max_key_size = length($input) / 5;
if( $max_key_size > MAX_KEY_SIZE )
{
$max_key_size = MAX_KEY_SIZE;
}
# Minimum accepted confidence threshold. Because we sort keys by
# confidence later, if we already found keys with higher confidence,
# there is no need to examine partial keys with lower confidence.
my $min_confidence = 0;
my %found_keys = ();
for(my $key_size = 1; $key_size < $max_key_size; $key_size++)
{
if( DEBUG )
{
print "Key size = $key_size\n";
}
my ($key, $confidence) =
FindKeyWithLength($dist, $input, $key_size, $min_confidence);
next unless defined $key;
next if exists $found_keys{$key};
# Remember confidence value for this key
$found_keys{$key} = $confidence;
# Remove other keys that are multiples of this key. This is so that
# if we see a key candidate in the form of "XX", we would return just
# "X" as the canonical key.
my $base_key = $key;
for(my $j = $key_size; $j < $max_key_size; $j += $key_size)
{
$key .= $base_key;
$found_keys{$key} = 0;
}
# Update minimum confidence level. This new key must be at equal or
# higher confidence, otherwise it would have been rejected already.
$min_confidence = $confidence;
}
# Output key with the highest confidence, break ties by preferring
# longer keys.
foreach my $key (sort {$found_keys{$b} <=> $found_keys{$a} ||
length($b) <=> length($a)}
keys %found_keys)
{
if( DEBUG )
{
print $found_keys{$key}, " -> ", ConvertToUppercaseKey($key), "\n";
}
last if $found_keys{$key} == 0;
print ConvertToUppercaseKey($key), "\n",
ConvertToLowercaseKey($key), "\n";
if( DEBUG ) { next; }
return;
}
}
# Expected character probabilities, from "Alice in Wonderland".
# http://www.gutenberg.org/ebooks/11
my @expected_distribution = (
.07969,
.01419,
.02439,
.04446,
.12517,
.01936,
.02392,
.06414,
.07019,
.00191,
.01048,
.04236,
.02005,
.06546,
.07705,
.01600,
.00178,
.05374,
.05909,
.09919,
.03234,
.00782,
.02400,
.00143,
.02101,
.00065,
);
# Find candidate keys for stdin
my $input = join '', <>;
Crack(\@expected_distribution, ConvertInput($input));