SVN: toys/rsget.pl/RSGet/Captcha.pm
sparky
sparky at pld-linux.org
Wed Oct 21 21:18:06 CEST 2009
Author: sparky
Date: Wed Oct 21 21:18:06 2009
New Revision: 10793
Modified:
toys/rsget.pl/RSGet/Captcha.pm
Log:
- unified captcha methods
- added rather useful captcha solving library
Modified: toys/rsget.pl/RSGet/Captcha.pm
==============================================================================
--- toys/rsget.pl/RSGet/Captcha.pm (original)
+++ toys/rsget.pl/RSGet/Captcha.pm Wed Oct 21 21:18:06 2009
@@ -3,23 +3,17 @@
use strict;
use warnings;
use Digest::MD5 qw(md5_hex);
+use File::Path;
use RSGet::Tools;
set_rev qq$Id$;
-=unused
def_settings(
- allow_captcha => {
- desc => "Allow captchas which need to be solved manually.",
- default => "http",
- allowed => qr/(http|yes|no)/,
- dynamic => {
- http => "Allow only if control page is opened.",
- yes => "Allow always",
- no => "Never allow",
- },
+ captcha_save_results => {
+ desc => "Save captcha results, for captcha debugging.",
+ default => 0,
+ allowed => qr/\d+/,
},
);
-=cut
our %needed;
our %solved;
@@ -29,21 +23,113 @@
{
my $self = shift;
my $next_stage = shift;
- my $ct = shift;
-
- my $md5 = md5_hex( $self->{body} );
-
- $needed{ $md5 } = [ $ct, $self->{body} ];
+ my $check = shift;
+ my %opts = @_;
- $self->linedata( captcha => $md5 );
+ die "Getter error, captcha argument is not a regexp\n"
+ if not $check or ref $check ne "Regexp";
+ my $data = $self->{body};
+ my $md5 = md5_hex( $data );
$self->{captcha_md5} = $md5;
$self->{captcha_next} = $next_stage;
+ $self->{captcha_data} = \$data;
+
$self->{captcha_until} = time + 200;
+ delete $self->{captcha_response};
my $id = 0;
++$id while exists $waiting{ $id };
$waiting{ $id } = $self;
+
+ if ( my $solver = $opts{solver} ) {
+ my $text;
+ local $SIG{__DIE__};
+ delete $SIG{__DIE__};
+ eval {
+ $text = &$solver( $self->{captcha_data} );
+ };
+ if ( $@ ) {
+ warn "Captcha solver problem: $@\n";
+ } else {
+ p "Captcha solver returned: " . $text
+ if verbose( 2 );
+ $text = undef unless $text =~ /^$check$/;
+ return $self->solved_delay( $text );
+ }
+ }
+ if ( my $process = $opts{process} ) {
+ my $text;
+ local $SIG{__DIE__};
+ delete $SIG{__DIE__};
+ eval {
+ die "tesseract not found\n" unless require_prog( "tesseract" );
+ my $image = new RSGet::Captcha::Image( $self->{captcha_data} );
+ $text = &$process( $image );
+ };
+ if ( $@ ) {
+ warn "Captcha process problem: $@\n";
+ } else {
+ p "Captcha process returned: " . $text
+ if verbose( 2 );
+ $text = undef unless $text =~ /^$check$/;
+ return $self->solved_delay( $text );
+ }
+ }
+
+ # add to ask list
+ $needed{ $md5 } = [ $self->{content_type}, $self->{captcha_data} ];
+ $self->linedata( captcha => $md5 );
+}
+
+sub captcha_result
+{
+ my $self = shift;
+ my $result = shift;
+
+ my $name = $self->{captcha_md5};
+ delete $self->{captcha_md5};
+
+ return unless setting( "captcha_save_results" );
+ return unless $name;
+
+ my $subdir;
+ if ( not defined $result ) {
+ $subdir = "unsolved";
+ } elsif ( $result =~ /^(ok|fail)$/i ) {
+ $subdir = lc $result;
+ $name .= "_" . $self->{captcha_response};
+ } else {
+ warn "Captcha Result is not OK or FAIL: $result\n";
+ return;
+ }
+
+ my $getter = $getters{ $self->{_pkg} };
+ my $dir = "captcha/$getter->{short}/$subdir";
+ mkpath( $dir ) unless -d $dir;
+
+ my $file = "$dir/$name";
+ open my $f, ">", $file;
+ print $f ${$self->{captcha_data}};
+ close $f;
+
+ $self->log( "Saved $file" )
+ if verbose( 1 );
+}
+
+sub solved_delay
+{
+ my $self = shift;
+ my $captcha = shift;
+
+ $self->linedata( wait => "delay" );
+ $self->{captcha_response} = $captcha;
+ my $wait = 5 + int rand 10;
+ unless ( defined $captcha ) {
+ $wait /= 4;
+ $self->captcha_result( undef );
+ }
+ $self->{captcha_until} = time + $wait;
}
sub solved
@@ -51,6 +137,7 @@
my $self = shift;
my $captcha = shift;
+ $self->{captcha_response} = $captcha;
$self->{body} = $captcha;
$_ = $captcha;
@@ -63,6 +150,7 @@
{
my $self = shift;;
+ $self->captcha_result( undef );
delete $self->{body};
$_ = undef;
@@ -77,16 +165,24 @@
foreach my $id ( keys %waiting ) {
my $obj = $waiting{ $id };
my $left = $obj->{captcha_until} - $time;
- my $md5 = $obj->{captcha_md5};
+ my $md5 = $obj->{captcha_md5} || "";
+ my $captcha = $obj->{captcha_response};
if ( $left <= 0 ) {
- $obj->print( "captcha not solved" );
- unsolved( $obj );
+ if ( $captcha ) {
+ solved( $obj, $captcha );
+ } else {
+ $obj->print( "captcha not solved" );
+ unsolved( $obj );
+ }
} elsif ( $obj->{_abort} ) {
$obj->abort();
} elsif ( my $s = $solved{ $md5 } ) {
solved( $obj, $s );
} else {
- $obj->print( "solve captcha " . s2string( $left ) );
+ $obj->print(
+ ( $captcha ? "captcha solved: $captcha, delaying " : "solve captcha " )
+ . s2string( $left )
+ );
next;
}
delete $waiting{ $id };
@@ -95,6 +191,512 @@
RSGet::Line::status( 'captcha' => scalar keys %waiting );
}
+package RSGet::Captcha::Image;
+use GD;
+use Math::Trig;
+
+# new from file data
+sub new # {{{
+{
+ my $class = shift;
+ my $imgdata = shift;
+
+ GD::Image->trueColor( 1 );
+ my $img = GD::Image->new( $$imgdata );
+
+ my $w = $img->width;
+ my $h = $img->height;
+
+ my @data;
+ for ( my $y = 0; $y < $h; $y++ ) {
+ my @line;
+ for ( my $x = 0; $x < $w; $x++ ) {
+ my $ci = $img->getPixel( $x, $y );
+ my @rgb = $img->rgb( $ci );
+ push @line, \@rgb;
+ }
+ push @data, \@line;
+ }
+
+ my $self = {
+ w => $w,
+ h => $h,
+ data => \@data,
+ };
+
+ bless $self, $class;
+} # }}}
+
+# new white image
+sub newWhite # {{{
+{
+ my $class = shift;
+ my $w = shift;
+ my $h = shift;
+
+ my @data;
+ for ( my $y = 0; $y < $h; $y++ ) {
+ my @line = map { 0xff } (1..$w);
+ push @data, \@line;
+ }
+
+ my $self = {
+ w => $w,
+ h => $h,
+ data => \@data,
+ };
+
+ bless $self, $class;
+} # }}}
+
+# write bitmap file
+sub write_bmp # {{{
+{
+ my $self = shift;
+ my $name = shift;
+
+ my $wlen = $self->{w} * 3 + 3;
+ $wlen &= ~3;
+ my $size = $wlen * $self->{h};
+
+ my $line_pad = "\0" x ( $wlen - $self->{w} * 3 );
+
+ my @lines;
+ for ( my $y = $self->{h} - 1; $y >= 0; $y-- ) {
+ my $iline = $self->{data}->[$y];
+ my @oline;
+ foreach my $pix ( @$iline ) {
+ my @pix;
+ if ( ref $pix ) {
+ @pix = map { $_ < 0 ? 0 : $_ > 255 ? 255 : int $_ } @$pix[ (2, 1, 0) ];
+ } else {
+ my $p = $pix < 0 ? 0 : $pix > 255 ? 255 : int $pix;
+ @pix = ( $p, $p, $p );
+ }
+ push @oline, pack "CCC", @pix;
+ }
+ push @lines, join "", @oline, $line_pad;
+ }
+
+ my @header = ( 66, 77, 54 + $size, 0, 54, 40,
+ $self->{w}, $self->{h}, 1, 24, 0, $size, 2835, 2835, 0, 0 );
+
+ my $header = pack "CCVVVVVVvvVVVVVV", @header;
+
+ open F_OUT, ">", $name;
+ binmode F_OUT;
+ print F_OUT $header;
+ print F_OUT join "", @lines;
+ close F_OUT;
+} # }}}
+
+# $code should return luma (greyscale) value
+sub color_filter # {{{
+{
+ my $self = shift;
+ my $code = shift;
+
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach my $pixel ( @$line ) {
+ $pixel = &$code( @$pixel );
+ }
+ }
+} # }}}
+
+# call $code for each pixel
+sub pix_filter # {{{
+{
+ my $self = shift;
+ my $code = shift;
+
+ my $w = $self->{w};
+ my $h = $self->{h};
+ for ( my $y = 0; $y < $h; $y++ ) {
+ for ( my $x = 0; $x < $w; $x++ ) {
+ my $pix = $self->pix( $x, $y );
+ &$code( $pix );
+ }
+ }
+} # }}}
+
+# bring $min..$max values to 0..255 interval
+sub luma_emphasize # {{{
+{
+ my $self = shift;
+ my $min = shift;
+ my $max = shift;
+ my $mult = 256 / ( $max - $min );
+
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach ( @$line ) {
+ $_ = ( $_ - $min ) * $mult;
+ }
+ }
+} # }}}
+
+# clip luma to 0..255 values
+sub luma_clip # {{{
+{
+ my $self = shift;
+
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach ( @$line ) {
+ $_ = $_ > 255 ? 255 : $_ < 0 ? 0 : $_;
+ }
+ }
+} # }}}
+
+# exponential to linear
+sub luma_degamma # {{{
+{
+ my $self = shift;
+ my $gamma = shift;
+
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach ( @$line ) {
+ $_ = ($_ / 255) ** $gamma * 255;
+ }
+ }
+} # }}}
+
+# linear to exponential
+sub luma_togamma # {{{
+{
+ my $self = shift;
+ return $self->luma_degamma( 1 / shift );
+} # }}}
+
+sub histogram # {{{
+{
+ my $self = shift;
+
+ my @h = map { 0 } (0..255);
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach ( @$line ) {
+ my $v = $_ < 0 ? 0 : $_ > 255 ? 255 : int $_;
+ $h[ $v ]++;
+ }
+ }
+
+ return \@h;
+} # }}}
+
+# set border pixels to some color
+sub set_border # {{{
+{
+ my $self = shift;
+ my $color = shift;
+
+ $self->set_lines( $color, 0, $self->{h} - 1 );
+ $self->set_columns( $color, 0, $self->{w} - 1 );
+} # }}}
+
+sub set_lines # {{{
+{
+ my $self = shift;
+ my $color = shift;
+ my @select = @_;
+
+ my $data = $self->{data};
+ foreach my $i ( @select ) {
+ my $line = $data->[ $i ];
+ foreach ( @$line ) {
+ $_ = $color;
+ }
+ }
+} # }}}
+sub set_columns # {{{
+{
+ my $self = shift;
+ my $color = shift;
+ my @select = @_;
+
+ my $data = $self->{data};
+ foreach my $line ( @$data ) {
+ foreach my $i ( @select ) {
+ $line->[ $i ] = $color;
+ }
+ }
+} # }}}
+
+# chop image into pieces
+sub chop # {{{
+{
+ my $self = shift;
+
+ my @left = (0, @_);
+ my @right = (@_, $self->{w});
+
+ my @parts;
+ for ( my $i = 0; $i < scalar @right; $i++ ) {
+ push @parts, $self->crop( x1 => $left[ $i ], x2 => $right[ $i ] - 1 );
+ }
+ return @parts;
+} # }}}
+
+sub img_rotate # {{{
+{
+=later
+ my $self = shift;
+ my $opts = shift;
+
+ my $select = $opts->{select} || 0;
+ $select = [ $select ] unless ref $select;
+
+ my $angle = $opts->{angle};
+ $angle = [ -$angle, +$angle ] unless ref $angle;
+
+ foreach my $i ( @$select ) {
+ my $img = $self->[ $i ];
+ my $best;
+ my $max = 0;
+ for ( my $a = $angle->[0]; $a <= $angle->[1]; $a += 15 ) {
+ my $r = $img->rotate( $a );
+ my $sum = $r->sum_columns( $opts->{sum} );
+ #print "Sum $i: $sum\n";
+ if ( $sum > $max ) {
+ $best = $r;
+ $max = $sum;
+ }
+ }
+ $self->add( $best );
+ }
+=cut
+} # }}}
+
+# call ocr program
+sub ocr # {{{
+{
+ my $self = shift;
+
+ my $rand = sprintf "%.6x", int rand 1 << 24;
+
+ my $bmp = "cap$rand.bmp";
+ my $txt = "cap$rand.txt";
+
+ unlink $bmp, $txt;
+ $self->write_bmp( $bmp );
+
+ system "tesseract $bmp cap$rand 2>/dev/null";
+
+ open my $f_in, "<", $txt;
+ my $text = <$f_in>;
+ close $f_in;
+ unlink $bmp, $txt;
+
+ return undef unless $text;
+ chomp $text;
+ return $text;
+} # }}}
<<diff output has been trimmed to 500 lines, 208 line(s) remained.>>
More information about the pld-cvs-commit
mailing list