ishiducaの日記 このページをアンテナに追加 RSSフィード

2011-11-05

[][]いまさら「YAPC::ASIA 2011 Tokyo 行ってきたので」(追記: 2011.11.07) 21:30 いまさら「YAPC::ASIA 2011 Tokyo 行ってきたので」(追記: 2011.11.07) - ishiducaの日記 を含むブックマーク はてなブックマーク - いまさら「YAPC::ASIA 2011 Tokyo 行ってきたので」(追記: 2011.11.07) - ishiducaの日記 いまさら「YAPC::ASIA 2011 Tokyo 行ってきたので」(追記: 2011.11.07) - ishiducaの日記 のブックマークコメント

YAPC::ASIA 2011 Tokyoでは Marc Lehmann さんのトークの中で「アニメ」を連呼していたのがよかったですね。

(中略)ということで、マルチなダウンロードには AnyEvent を使うのがいいんだな! と勝手に合点して、AnyEvent::Pixiv::Download というのを書いた。今更感がすごいけど。

追記: github に上げました https://github.com/ishiduca/p5-AnyEvent-Pixiv-Download 原理的には同じなんだけど、メソッド名とか変わってます (2011.11.07)

サンプルのダウンロードスクリプト(dl.pl)

#!/usr/bin/env perl
use strict;
use warnings;
use Config::Pit;
use AnyEvent::Pixiv::Download;

my $config = pit_get('www.pixiv.net', require => {
    pixiv_id => '', pass => '',
});

my $cv = AE::cv;

my $client = AnyEvent::Pixiv::Download->new(
    pixiv_id => $config->{pixiv_id},
    pass     => $config->{pass},
);

for my $illust_id (qw/22854495 22856417 22855011 22849905/) {

    $client->to_mode_medium($illust_id, 'deep', sub {
        my $information = shift;
        my $illust_top_url     = $information->{illust_top_url};

        for my $img_src (@{$information->{contents}}) {

            $cv->begin;
            $client->download($img_src, $illust_top_url, sub {
                my(undef, $headers) = @_;
                warn "!! finish ", $headers->{URL}, "\n";
                $cv->end;
            });
        }
    });
}

$cv->recv;

1;

多重コールバックになるUIとか、リ○○ラをいちいち指定しなくちゃいけないのがださくて仕方ないので、ツッコミください


lib/AnyEvent/Pixiv/Download.pm

package AnyEvent::Pixiv::Download;

use warnings;
use strict;
use Carp;
use AnyEvent;
use AnyEvent::HTTP;
use Web::Scraper;
use File::Basename;
use Data::Dumper;

our $VERSION = '0.01';

my $www_pixiv_net = 'http://www.pixiv.net';
my $login_php     = "${www_pixiv_net}/login.php";
my $mypage_php    = "${www_pixiv_net}/mypage.php";
my $illust_top    = "${www_pixiv_net}/member_illust.php?mode=medium&illust_id=";

sub new {
    my $class = shift;
    my %args  = @_;

    my $self  = bless {}, $class;

    $self->{pixiv_id}   = delete $args{pixiv_id} || Carp::croak qq(! faild: "pixiv_id" not found);
    $self->{pass}       = delete $args{pass}     || Carp::croak qq(! failed: "pass" not found);
    $self->{verbose}    = delete $args{verbose}  || 1; # this test mode !
    $self->{cookie_jar} = {};
    $self->{information_mode_medium} = {};

    $self->login;

    return $self;
}

sub login {
    my $self = shift;

    my $sub_cv = AE::cv;
    my $login; $login = http_request('POST' => $login_php,
        headers => { 'content-type' => 'application/x-www-form-urlencoded' },
        body    => "mode=login&pixiv_id=$self->{pixiv_id}&pass=$self->{pass}",
        recurse => 0,
        sub {
            my($body, $headers) = @_;
            warn Dumper $headers             if $self->{verbose} == 2;
            warn qq(fetch: "${login_php}"\n) if $self->{verbose} == 1;

            Carp::croak qq(! failed: "set-cookie" not found at $headers->{URL}\n)
                unless $headers->{'set-cookie'};

            $self->{cookie_jar} = _cookie_jar_hogehoge($headers->{'set-cookie'})
                or Carp::croak qq(! failed: something wrong...\n);

            warn Dumper $self->{cookie_jar}                   if $self->{verbose} == 2;
            warn qq(get_cookie: "$headers->{'set-cookie'}"\n) if $self->{verbose} == 1;

            my $location = $headers->{'location'};

            undef $login;

            my $redirect; $redirect = http_request('GET' => $location,
                cookie_jar => $self->{cookie_jar},
                sub {    
                    my($body, $headers) = @_;
                    Carp::croak qq(! failed: "redirect" failed\n  $headers->{URL}\n)
                        if $headers->{URL} ne $location;

                    warn Dumper $headers                if $self->{verbose} == 2;
                    warn qq(fetch: "$headers->{URL}"\n) if $self->{verbose} == 1;
                    
                    undef $redirect;
                    $sub_cv->send("sucess: login !\n");
                }
            );
        }
    );
    my $message = $sub_cv->recv;
    warn $message;

    return $self;
}

sub to_mode_medium {
    my $self      = shift;
    my $cb        = pop;
    my $illust_id = shift || Carp::croak qq(! failed: "illust_id" not found\n);
    my $deep      = shift;

    my $mode_medium; $mode_medium = http_request('GET', "${illust_top}${illust_id}",
        cookie_jar => $self->{cookie_jar},
        sub {
            my($body, $headers) = @_;
            warn Dumper $headers                if $self->{verbose} == 2;
            warn qq(fetch: "$headers->{URL}"\n) if $self->{verbose} == 1;

            Carp::croak qq(! failed: something wrong...\n  $headers->{URL}\n)
                if $headers->{URL} ne "${illust_top}${illust_id}";

            my $information  = _scrape_mode_medium($body, $headers->{URL});

            if ($deep) {
                my $mode_big; $mode_big = http_request('GET', $information->{contents_url},
                    cookie_jar => $self->{cookie_jar},
                    headers    => { referer => $headers->{URL} },
                    sub {
                        my($body, $headers) = @_;

                        Carp::croak qq(! failed: something wrong...\n  $headers->{URL}\n)
                            if $headers->{URL} ne $information->{contents_url};

                        warn Dumper $headers                if $self->{verbose} == 2;
                        warn qq(fetch: "$headers->{URL}"\n) if $self->{verbose} == 1;

                        if ($information->{contents_url} =~ /mode=manga/) {
                            $information->{mode} = 'manga';
                            $information->{contents} = [];
                            while ($body =~ m!(http://img\d\d\.pixiv\.net/img/[^']+?)'!g) {
                                push @{$information->{contents}}, $1;
                            }
                        } else { # $information->{contents_url} =~ /mode=big/
                            $information->{mode} = 'big';
                            my $scraper = scraper {
                                process '//div/a/img[1]', 'img_src' => '@src';
                            };
                            $information->{contents} = [ ($scraper->scrape($body))->{img_src} ];
                        }

                        $self->{information_mode_medium} = $information;
                        warn Dumper $information if $self->{verbose} == 2;
                        warn Dumper $information if $self->{verbose} == 1;

                        undef $mode_big;
                        undef $mode_medium;
                        $cb->($information);
                    }
                );
            } else {
                $self->{information_mode_medium}->{$illust_id} = $information;
                warn Dumper $information if $self->{verbose} == 2;
                warn Dumper $information if $self->{verbose} == 1;

                undef $mode_medium;
                $cb->($information);
            }
        }
    );

    return $self;
}

sub download {
    my $self    = shift;
    my $cb      = pop;
    my $img_src = shift || Carp::croak qq(! failed: "img_src" not found\n);;
    my $referer = shift || Carp::croak qq(! failed: "referer" not found\n);
    my $options = shift;

    my $on_body = ($options->{on_body}) ? $options->{on_body} : (sub {
        my $filename = basename $img_src;
        $filename =~ s/\?.*$//;
        open my $fh, '>', $filename or Carp::croak qq(! failed: "${filename}" $!\n);
        binmode $fh;
        return sub {
            my($partial_body, $headers) = @_;
            if ($headers->{Status} =~ /^2/) {
                print $fh $partial_body;
            }
            return 1;
        };
    })->();

    my $done; $done = http_request('GET' => $img_src,
        cookie_jar => $self->{cookie_jar},
        headers    => { referer => $referer },
        on_header => sub {
            my $headers = shift;
            if ($headers->{Status} ne '200') {
                ($options->{on_error} || sub { die @_ })->(qq(failed: "${img_src}" $headers->{Status} $headers->{Reason}\n));
                return ;
            }
            return 1;
        },
        on_body => $on_body,
        sub {
            my($body, $headers) = @_;
            undef $done;
            $cb->(@_);
        }
    );

    return $self;
}

sub _scrape_mode_medium {
    my $body           = shift || Carp::croak qq(! failed: "body" not found\n);
    my $illust_top_url = shift || Carp::croak qq(! failed: "illust_top_url" not found\n);

    my $scraper = scraper {
        process '//h3[1]', 'title' => 'TEXT';
        process '//p[@class="works_caption"]', 'description' => 'HTML';
        process '//a[@class="avatar_m"]', 'author_name' => '@title';
        process '//a[@class="avatar_m"]', 'author_url'  => [ '@href', sub {
            return $www_pixiv_net . $_;
        } ];
        process '//div[@class="works_display"]/a[1]', 'contents_url' => [ '@href', sub {
            return join '/', $www_pixiv_net, $_;
        } ];
        process '//div[@class="works_display"]/a[1]/img[1]', 'img_src' => '@src';
    };

    my $information = $scraper->scrape($body);
    $information->{author} = {};
    $information->{author}->{name} = delete $information->{author_name};
    $information->{author}->{url}  = delete $information->{author_url};
    $information->{illust_top_url} = $illust_top_url;
    return $information;
}

sub _cookie_jar_hogehoge {
    local $_ = shift || return ;
    my %cookie = ();
    my $phpsessid = 'PHPSESSID';
    map{
        my($key, $value) = split /=/;
        $cookie{$key} = $value;
    }(split /; /);

    Carp::croak  qq(! failed: "${phpsessid}" not found) unless $cookie{$phpsessid};

    return {
        $cookie{domain} => {
            $cookie{path} => {
                $phpsessid => {
                    _expires => AnyEvent::HTTP::parse_date $cookie{expires},
                    value    => $cookie{$phpsessid},
                },
            },
        },
        version => 1,
    };
}


1;

トラックバック - http://perl.g.hatena.ne.jp/ishiduca/20111105