SVN: toys/rsget.pl/RSGet/Captcha.pm

sparky sparky at pld-linux.org
Wed Oct 21 21:18:06 CEST 2009


Author: sparky
Date: Wed Oct 21 21:18:06 2009
New Revision: 10793

Modified:
   toys/rsget.pl/RSGet/Captcha.pm
Log:
- unified captcha methods
- added rather useful captcha solving library


Modified: toys/rsget.pl/RSGet/Captcha.pm
==============================================================================
--- toys/rsget.pl/RSGet/Captcha.pm	(original)
+++ toys/rsget.pl/RSGet/Captcha.pm	Wed Oct 21 21:18:06 2009
@@ -3,23 +3,17 @@
 use strict;
 use warnings;
 use Digest::MD5 qw(md5_hex);
+use File::Path;
 use RSGet::Tools;
 set_rev qq$Id$;
 
-=unused
 def_settings(
-	allow_captcha => {
-		desc => "Allow captchas which need to be solved manually.",
-		default => "http",
-		allowed => qr/(http|yes|no)/,
-		dynamic => {
-			http => "Allow only if control page is opened.",
-			yes => "Allow always",
-			no => "Never allow",
-		},
+	captcha_save_results => {
+		desc => "Save captcha results, for captcha debugging.",
+		default => 0,
+		allowed => qr/\d+/,
 	},
 );
-=cut
 
 our %needed;
 our %solved;
@@ -29,21 +23,113 @@
 {
 	my $self = shift;
 	my $next_stage = shift;
-	my $ct = shift;
-
-	my $md5 = md5_hex( $self->{body} );
-
-	$needed{ $md5 } = [ $ct, $self->{body} ];
+	my $check = shift;
+	my %opts = @_;
 
-	$self->linedata( captcha => $md5 );
+	die "Getter error, captcha argument is not a regexp\n"
+		if not $check or ref $check ne "Regexp";
 
+	my $data = $self->{body};
+	my $md5 = md5_hex( $data );
 	$self->{captcha_md5} = $md5;
 	$self->{captcha_next} = $next_stage;
+	$self->{captcha_data} = \$data;
+
 	$self->{captcha_until} = time + 200;
+	delete $self->{captcha_response};
 
 	my $id = 0;
 	++$id while exists $waiting{ $id };
 	$waiting{ $id } = $self;
+
+	if ( my $solver = $opts{solver} ) {
+		my $text;
+		local $SIG{__DIE__};
+		delete $SIG{__DIE__};
+		eval {
+			$text = &$solver( $self->{captcha_data} );
+		};
+		if ( $@ ) {
+			warn "Captcha solver problem: $@\n";
+		} else {
+			p "Captcha solver returned: " . $text
+				if verbose( 2 );
+			$text = undef unless $text =~ /^$check$/;
+			return $self->solved_delay( $text );
+		}
+	}
+	if ( my $process = $opts{process} ) {
+		my $text;
+		local $SIG{__DIE__};
+		delete $SIG{__DIE__};
+		eval {
+			die "tesseract not found\n" unless require_prog( "tesseract" );
+			my $image = new RSGet::Captcha::Image( $self->{captcha_data} );
+			$text = &$process( $image );
+		};
+		if ( $@ ) {
+			warn "Captcha process problem: $@\n";
+		} else {
+			p "Captcha process returned: " . $text
+				if verbose( 2 );
+			$text = undef unless $text =~ /^$check$/;
+			return $self->solved_delay( $text );
+		}
+	}
+
+	# add to ask list
+	$needed{ $md5 } = [ $self->{content_type}, $self->{captcha_data} ];
+	$self->linedata( captcha => $md5 );
+}
+
+sub captcha_result
+{
+	my $self = shift;
+	my $result = shift;
+
+	my $name = $self->{captcha_md5};
+	delete $self->{captcha_md5};
+
+	return unless setting( "captcha_save_results" );
+	return unless $name;
+
+	my $subdir;
+	if ( not defined $result ) {
+		$subdir = "unsolved";
+	} elsif ( $result =~ /^(ok|fail)$/i ) {
+		$subdir = lc $result;
+		$name .= "_" . $self->{captcha_response};
+	} else {
+		warn "Captcha Result is not OK or FAIL: $result\n";
+		return;
+	}
+
+	my $getter = $getters{ $self->{_pkg} };
+	my $dir = "captcha/$getter->{short}/$subdir";
+	mkpath( $dir ) unless -d $dir;
+
+	my $file = "$dir/$name";
+	open my $f, ">", $file;
+	print $f ${$self->{captcha_data}};
+	close $f;
+
+	$self->log( "Saved $file" )
+		if verbose( 1 );
+}
+
+sub solved_delay
+{
+	my $self = shift;
+	my $captcha = shift;
+
+	$self->linedata( wait => "delay" );
+	$self->{captcha_response} = $captcha;
+	my $wait = 5 + int rand 10;
+	unless ( defined $captcha ) {
+		$wait /= 4;
+		$self->captcha_result( undef );
+	}
+	$self->{captcha_until} = time + $wait;
 }
 
 sub solved
@@ -51,6 +137,7 @@
 	my $self = shift;
 	my $captcha = shift;
 
+	$self->{captcha_response} = $captcha;
 	$self->{body} = $captcha;
 	$_ = $captcha;
 
@@ -63,6 +150,7 @@
 {
 	my $self = shift;;
 
+	$self->captcha_result( undef );
 	delete $self->{body};
 	$_ = undef;
 
@@ -77,16 +165,24 @@
 	foreach my $id ( keys %waiting ) {
 		my $obj = $waiting{ $id };
 		my $left = $obj->{captcha_until} - $time;
-		my $md5 = $obj->{captcha_md5};
+		my $md5 = $obj->{captcha_md5} || "";
+		my $captcha = $obj->{captcha_response};
 		if ( $left <= 0 ) {
-			$obj->print( "captcha not solved" );
-			unsolved( $obj );
+			if ( $captcha ) {
+				solved( $obj, $captcha );
+			} else {
+				$obj->print( "captcha not solved" );
+				unsolved( $obj );
+			}
 		} elsif ( $obj->{_abort} ) {
 			$obj->abort();
 		} elsif ( my $s = $solved{ $md5 } ) {
 			solved( $obj, $s );
 		} else {
-			$obj->print( "solve captcha " . s2string( $left ) );
+			$obj->print(
+				( $captcha ? "captcha solved: $captcha, delaying " : "solve captcha " )
+				. s2string( $left )
+			);
 			next;
 		}
 		delete $waiting{ $id };
@@ -95,6 +191,512 @@
 	RSGet::Line::status( 'captcha' => scalar keys %waiting );
 }
 
+package RSGet::Captcha::Image;
+use GD;
+use Math::Trig;
+
+# new from file data
+sub new # {{{
+{
+	my $class = shift;
+	my $imgdata = shift;
+
+	GD::Image->trueColor( 1 );
+	my $img = GD::Image->new( $$imgdata );
+
+	my $w = $img->width;
+	my $h = $img->height;
+
+	my @data;
+	for ( my $y = 0; $y < $h; $y++ ) {
+		my @line;
+		for ( my $x = 0; $x < $w; $x++ ) {
+			my $ci = $img->getPixel( $x, $y );
+			my @rgb = $img->rgb( $ci );
+			push @line, \@rgb;
+		}
+		push @data, \@line;
+	}
+
+	my $self = {
+		w => $w,
+		h => $h,
+		data => \@data,
+	};
+
+	bless $self, $class;
+} # }}}
+
+# new white image
+sub newWhite # {{{
+{
+	my $class = shift;
+	my $w = shift;
+	my $h = shift;
+
+	my @data;
+	for ( my $y = 0; $y < $h; $y++ ) {
+		my @line = map { 0xff } (1..$w);
+		push @data, \@line;
+	}
+	
+	my $self = {
+		w => $w,
+		h => $h,
+		data => \@data,
+	};
+
+	bless $self, $class;
+} # }}}
+
+# write bitmap file
+sub write_bmp # {{{
+{
+	my $self = shift;
+	my $name = shift;
+
+	my $wlen = $self->{w} * 3 + 3;
+	$wlen &= ~3;
+	my $size = $wlen * $self->{h};
+
+	my $line_pad = "\0" x ( $wlen - $self->{w} * 3 );
+
+	my @lines;
+	for ( my $y = $self->{h} - 1; $y >= 0; $y-- ) {
+		my $iline = $self->{data}->[$y];
+		my @oline;
+		foreach my $pix ( @$iline ) {
+			my @pix;
+			if ( ref $pix ) {
+				@pix = map { $_ < 0 ? 0 : $_ > 255 ? 255 : int $_ } @$pix[ (2, 1, 0) ];
+			} else {
+				my $p = $pix < 0 ? 0 : $pix > 255 ? 255 : int $pix;
+				@pix = ( $p, $p, $p );
+			}
+			push @oline, pack "CCC", @pix;
+		}
+		push @lines, join "", @oline, $line_pad;
+	}
+
+	my @header = ( 66, 77, 54 + $size, 0, 54, 40,
+		$self->{w}, $self->{h}, 1, 24, 0, $size, 2835, 2835, 0, 0 );
+
+	my $header = pack "CCVVVVVVvvVVVVVV", @header;
+
+	open F_OUT, ">", $name;
+	binmode F_OUT;
+	print F_OUT $header;
+	print F_OUT join "", @lines;
+	close F_OUT;
+} # }}}
+
+# $code should return luma (greyscale) value
+sub color_filter # {{{
+{
+	my $self = shift;
+	my $code = shift;
+
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach my $pixel ( @$line ) {
+			$pixel = &$code( @$pixel );
+		}
+	}
+} # }}}
+
+# call $code for each pixel
+sub pix_filter # {{{
+{
+	my $self = shift;
+	my $code = shift;
+
+	my $w = $self->{w};
+	my $h = $self->{h};
+	for ( my $y = 0; $y < $h; $y++ ) {
+		for ( my $x = 0; $x < $w; $x++ ) {
+			my $pix = $self->pix( $x, $y );
+			&$code( $pix );
+		}
+	}
+} # }}}
+
+# bring $min..$max values to 0..255 interval
+sub luma_emphasize # {{{
+{
+	my $self = shift;
+	my $min = shift;
+	my $max = shift;
+	my $mult = 256 / ( $max - $min );
+
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach ( @$line ) {
+			$_ = ( $_ - $min ) * $mult;
+		}
+	}
+} # }}}
+
+# clip luma to 0..255 values
+sub luma_clip # {{{
+{
+	my $self = shift;
+
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach ( @$line ) {
+			$_ = $_ > 255 ? 255 : $_ < 0 ? 0 : $_;
+		}
+	}
+} # }}}
+
+# exponential to linear
+sub luma_degamma # {{{
+{
+	my $self = shift;
+	my $gamma = shift;
+
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach ( @$line ) {
+			$_ = ($_ / 255) ** $gamma * 255;
+		}
+	}
+} # }}}
+
+# linear to exponential
+sub luma_togamma # {{{
+{
+	my $self = shift;
+	return $self->luma_degamma( 1 / shift );
+} # }}}
+
+sub histogram # {{{
+{
+	my $self = shift;
+
+	my @h = map { 0 } (0..255);
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach ( @$line ) {
+			my $v = $_ < 0 ? 0 : $_ > 255 ? 255 : int $_;
+			$h[ $v ]++;
+		}
+	}
+
+	return \@h;
+} # }}}
+
+# set border pixels to some color
+sub set_border # {{{
+{
+	my $self = shift;
+	my $color = shift;
+
+	$self->set_lines( $color, 0, $self->{h} - 1 );
+	$self->set_columns( $color, 0, $self->{w} - 1 );
+} # }}}
+
+sub set_lines # {{{
+{
+	my $self = shift;
+	my $color = shift;
+	my @select = @_;
+
+	my $data = $self->{data};
+	foreach my $i ( @select ) {
+		my $line = $data->[ $i ];
+		foreach ( @$line ) {
+			$_ = $color;
+		}
+	}
+} # }}}
+sub set_columns # {{{
+{
+	my $self = shift;
+	my $color = shift;
+	my @select = @_;
+
+	my $data = $self->{data};
+	foreach my $line ( @$data ) {
+		foreach my $i ( @select ) {
+			$line->[ $i ] = $color;
+		}
+	}
+} # }}}
+
+# chop image into pieces
+sub chop # {{{
+{
+	my $self = shift;
+
+	my @left = (0, @_);
+	my @right = (@_, $self->{w});
+
+	my @parts;
+	for ( my $i = 0; $i < scalar @right; $i++ ) {
+		push @parts, $self->crop( x1 => $left[ $i ], x2 => $right[ $i ] - 1 );
+	}
+	return @parts;
+} # }}}
+
+sub img_rotate # {{{
+{
+=later
+	my $self = shift;
+	my $opts = shift;
+
+	my $select = $opts->{select} || 0;
+	$select = [ $select ] unless ref $select;
+
+	my $angle = $opts->{angle};
+	$angle = [ -$angle, +$angle ] unless ref $angle;
+
+	foreach my $i ( @$select ) {
+		my $img = $self->[ $i ];
+		my $best;
+		my $max = 0;
+		for ( my $a = $angle->[0]; $a <= $angle->[1]; $a += 15 ) {
+			my $r = $img->rotate( $a );
+			my $sum = $r->sum_columns( $opts->{sum} );
+			#print "Sum $i: $sum\n";
+			if ( $sum > $max ) {
+				$best = $r;
+				$max = $sum;
+			}
+		}
+		$self->add( $best );
+	}
+=cut
+} # }}}
+
+# call ocr program
+sub ocr # {{{
+{
+	my $self = shift;
+
+	my $rand = sprintf "%.6x", int rand 1 << 24;
+
+	my $bmp = "cap$rand.bmp";
+	my $txt = "cap$rand.txt";
+
+	unlink $bmp, $txt;
+	$self->write_bmp( $bmp );
+	
+	system "tesseract $bmp cap$rand 2>/dev/null";
+	
+	open my $f_in, "<", $txt;
+	my $text = <$f_in>;
+	close $f_in;
+	unlink $bmp, $txt;
+
+	return undef unless $text;
+	chomp $text;
+	return $text;
+} # }}}
<<diff output has been trimmed to 500 lines, 208 line(s) remained.>>


More information about the pld-cvs-commit mailing list