Utilisateur:KelBot/KelBot.pm

Un article de Wikipédia, l'encyclopédie libre.

package KelBot;
 
use strict;
use WWW::Mechanize;
use HTML::Entities;
use URI::Escape;
use XML::Simple;
use Carp;
use Encode;
use URI::Escape qw(uri_escape_utf8);
 
my $default_username = "";
my $default_password = "";
my $limit=5000;
 
sub new {
    my $package = shift;
    my $agent = shift || 'KelBot'; 
 
    my $self = bless {}, $package;
    $self->{mech} = WWW::Mechanize->new( cookie_jar => {}, onerror => \&Carp::carp );
    $self->{mech}->agent("$agent");
    $self->{host}   = 'fr.wikipedia.org';
    $self->{path}   = 'w';
    $self->{debug}  = 0;
    $self->{errstr} = '';
    return $self;
}
 
sub _get {
    my $self      = shift;
    my $page      = shift;
    my $action    = shift || 'view';
    my $extra     = shift;
    my $no_escape = shift || 0;
 
    my $url =
      "http://$self->{host}/$self->{path}/index.php?title=$page&action=$action";
    $url .= $extra if $extra;
    print STDERR  "Retrieving $url\n" if $self->{debug};
    my $res = $self->{mech}->get($url);
 
    if ( $res->is_success() ) {
        if ( $res->content =~ m/The action you have requested is limited to users in the group (.+)\./ ) {
            my $group = $1;
            $group =~ s/<.+?>//g;
            $self->{errstr} = qq/Error requesting $page: You must be in the user group "$group"/;
            carp $self->{errstr};
            return 0;
        } else {
            return $res;
        }
    } else {
        $self->{errstr} = "Error requesting $page: " . $res->status_line();
        carp $self->{errstr};
        return 0;
    }
}
 
sub _get_api {
    my $self  = shift;
    my $query = shift;
    print STDERR  "Retrieving http://$self->{host}/$self->{path}/api.php?$query\n"
      if $self->{debug};
    my $res =
      $self->{mech}->get("http://$self->{host}/$self->{path}/api.php?$query");
    if ( $res->is_success() ) {
        return $res;
    } else {
        $self->{errstr} = "Error requesting api.php?$query: " . $res->status_line();
        carp $self->{errstr};
        return 0;
    }
}
 
sub _put {
    my $self    = shift;
    my $page    = shift;
    my $options = shift;
    my $extra   = shift;
    my $res     = $self->_get( $page, 'edit', $extra );
    unless ($res) { return; }
    if ( ( $res->content ) =~ m/<textarea .+?readonly='readonly'/ ) {
        $self->{errstr} = "Error editing $page: Page is protected";
        carp $self->{errstr};
        return 0;
    }
    $res = $self->{mech}->submit_form( %{$options} );
    return $res;
}
 
sub set_wiki {
    my $self = shift;
    $self->{host} = shift;
    $self->{path} = shift;
    print STDERR  "Wiki set to http://$self->{host}/$self->{path}\n" if $self->{debug};
    return 0;
}
 
sub login {
    my $self     = shift;
    my $editor   = shift || $default_username;
    my $password = shift || $default_password;
    my $cookies  = ".perlwikipedia-$editor-cookies";
    $self->{mech}->cookie_jar(
        { file => $cookies, autosave => 1 } );
    if ( !defined $password ) {
        $self->{mech}->{cookie_jar}->load($cookies);
        my $cookies_exist = $self->{mech}->{cookie_jar}->as_string;
        if ($cookies_exist) {
            $self->{mech}->{cookie_jar}->load($cookies);
            print STDERR  "Loaded MediaWiki cookies from file $cookies\n" if $self->{debug};
            return 0;
        } else {
            $self->{errstr} = "Cannot load MediaWiki cookies from file $cookies";
            carp $self->{errstr};
            return 1;
        }
    }
    my $res = $self->_put(
        'Special:Userlogin',
        {
            form_name => 'userlogin',
            fields    => {
                wpName     => $editor,
                wpPassword => $password,
                wpRemember => 1,
            },
        }
    );
    unless ($res) { return; }
 
    my $content = $res->content();
    my $login_status;
 
    if ( $content =~ m/var wgUserName = "$editor"/ ) {
        print STDERR  qq/Login as "$editor" succeeded.\n/ if $self->{debug};
        $login_status = 0;
    } else {
        if ( $content =~ m/There is no user by the name/ ) {
            $self->{errstr} = qq/Login as "$editor" failed: User "$editor" does not exist/;
        } elsif ( $content =~ m/Incorrect password entered/ ) {
            $self->{errstr} = qq/Login as "$editor" failed: Bad password/;
        } elsif ( $content =~ m/Password entered was blank/ ) {
            $self->{errstr} = qq/Login as "$editor" failed: Blank password/;
        }
        $login_status = 1;
    }
 
    die "I can't log in." unless ($login_status eq '0');
 
    return $login_status;
}
 
sub edit {
    my $self     = shift;
    my $page     = shift;
    my $text     = shift;
    my $summary  = shift;
    my $is_minor = 1;
    my $res;
 
#     $text = encode( 'utf8', $text );
 
    my $options  = {
                    form_name => 'editform',
                    fields    => {
                                  wpSummary   => $summary,
                                  wpTextbox1  => $text,
                                 },
                   };
 
    $options->{fields}->{wpMinoredit} = 1 if ($is_minor);
 
    $res = $self->_put($page, $options);
    return $res;
}
 
sub get_history {
    my $self      = shift;
    my $pagename  = shift;
    my $limit     = shift || 5;
    my $rvstartid = shift || '';
    my $direction = shift;
 
    my @return;
    my @revisions;
 
    if ( $limit > 50 ) {
        $self->{errstr} = "Error requesting history for $pagename: Limit may not be set to values above 50";
        carp $self->{errstr};
        return 1;
    }
    my $query = "action=query&prop=revisions&titles=$pagename&rvlimit=$limit&rvprop=ids|timestamp|user|comment&format=xml";
    if ( $rvstartid ) {
        $query .= "&rvstartid=$rvstartid";
    }
    if ( $direction ) {
        $query .= "&rvdir=$direction";
    }
    my $res = $self->_get_api($query);
 
    unless ($res) { return 1; }
    my $xml = XMLin( $res->content );
 
    if ( ref( $xml->{query}->{pages}->{page}->{revisions}->{rev} ) eq "HASH" ) {
        $revisions[0] = $xml->{query}->{pages}->{page}->{revisions}->{rev};
    }
    else {
        @revisions = @{ $xml->{query}->{pages}->{page}->{revisions}->{rev} };
    }
 
    foreach my $hash ( @revisions ) {
        my $revid = $hash->{revid};
        my $user  = $hash->{user};
        my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
        $timestamp_time=~s/Z$//;
        my $comment = $hash->{comment};
        push ( @return, {
                revid          => $revid,
                user           => $user,
                timestamp_date => $timestamp_date,
                timestamp_time => $timestamp_time,
                comment        => $comment,
        } );
    }
    return @return;
}
 
sub get_text {
    my $self     = shift;
    my $pagename = shift;
    my $revid    = shift || '';
    my $section  = shift || '';
    my $recurse  = shift || 0;
 
    my $wikitext = '';
    my $res;
 
    $res = $self->_get( $pagename, 'edit', "&oldid=$revid&section=$section" );
 
    unless ($res) { return 1; }
    if ($recurse) {
        until ( $res->content =~ m/var wgAction = "edit"/ ) {
            my $real_title;
            if ( $res->content =~ m/var wgTitle = "(.+?)"/ ) {
                $real_title = $1;
            }
            $res = $self->_get( $real_title, 'edit' );
        }
    }
    if ( $res->content =~ /<textarea.+?\s?>(.+)<\/textarea>/s ) {
                $wikitext = $1; 
    } else {
        $self->{errstr} = "Could not get_text for $pagename!";
        carp $self->{errstr};
    }
 
        return decode_entities($wikitext);
}
 
=item revert($pagename,$edit_summary,$old_revision_id)
 
Reverts the specified page to $old_revision_id, with an edit summary of $edit_summary.
 
=cut
 
sub revert {
    my $self     = shift;
    my $pagename = shift;
    my $summary  = shift;
    my $revid    = shift;
 
    return $self->_put(
        $pagename,
        {
            form_name => 'editform',
            fields    => { wpSummary => $summary, },
        },
        "&oldid=$revid"
    );
}
 
=item get_last($pagename,$username)
 
Returns the number of the last revision not made by $username.
 
=cut
 
sub get_last {
    my $self     = shift;
    my $pagename = shift;
    my $editor   = shift;
 
    my $revertto = 0;
 
    my $res =
      $self->_get_api( "action=query&prop=revisions&titles=$pagename&rvlimit=20&rvprop=ids|user&rvexcludeuser=$editor&format=xml" );
    unless ($res) { return 1; }
    my $xml = XMLin( $res->content );
    if( ref( $xml->{query}->{pages}->{page}->{revisions}->{rev} ) eq 'ARRAY' ) {
                $revertto = $xml->{query}->{pages}->{page}->{revisions}->{rev}[0]->{revid};
        }
        else {
                $revertto = $xml->{query}->{pages}->{page}->{revisions}->{rev}->{revid};
        }
    return $revertto;
}
 
sub update_rc {
    my $self = shift;
    my $limit = shift || 5;
    my @rc_table;
 
    my $res =
      $self->_get_api(
        "action=query&list=recentchanges&rcnamespace=0&rclimit=$limit&format=xml");
    unless ($res) { return 1; }
 
    my $xml = XMLin( $res->content );
    foreach my $hash ( @{ $xml->{query}->{recentchanges}->{rc} } ) {
        my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
        $timestamp_time =~ s/Z$//;
        push( @rc_table, {
                        pagename       => $hash->{title},
                        revid          => $hash->{revid},
                        oldid          => $hash->{old_revid},
                        timestamp_date => $timestamp_date,
                        timestamp_time => $timestamp_time,
                        }
        );
    }
 
    return @rc_table;
}
 
sub last_contrib {
    my $self = shift;
    my $limit = shift || 5;
    my @rc_table;
 
    my $res =
      $self->_get_api(
        "action=query&list=usercontribs&format=xml&ucuser=".$default_username);
 
    unless ($res) { return 1; }
 
    my $xml = XMLin( $res->content );
    foreach my $hash ( @{ $xml->{query}->{usercontribs}->{item} } ) {
        my ( $timestamp_date, $timestamp_time ) = split( /T/, $hash->{timestamp} );
        $timestamp_time =~ s/Z$//;
        push( @rc_table, {
                        title          => $hash->{title},
                        revid          => $hash->{revid},
                        comment        => $hash->{comment},
                        timestamp_date => $timestamp_date,
                        timestamp_time => $timestamp_time,
                        }
        );
    }
 
    return @rc_table;
}
 
sub what_links_here {
    my $self    = shift;
    my $article = shift;
    my @links;
 
    my $continue;
    my $xml;
    do {
        my $res = $self->_get_api( "action=query&format=xml&blnamespace=0&blfilterredir=nonredirects&list=backlinks&bllimit=".$limit."&bltitle=".$article.($continue ? "&blcontinue=".$continue : "") );
 
        unless ($res) { return 1; }
 
        $xml = XMLin( $res->content );
 
        foreach my $hash ( @{ $xml->{query}->{backlinks}->{bl} } ) {
            push( @links,  {
                title          => $hash->{title},
            }
                  );
        }
    } while ($continue = $xml->{"query-continue"}->{backlinks}->{blcontinue} );
 
    return @links;
}
 
sub embedded_in {
    my $self    = shift;
    my $article = shift;
    my @links;
 
    my $continue;
    my $xml;
    do {
        my $res = $self->_get_api( "action=query&format=xml&eifilterredir=nonredirects&list=embeddedin&eilimit=".$limit."&eititle=".$article.($continue ? "&eicontinue=".$continue : "") );
 
        unless ($res) { return 1; }
 
        $xml = XMLin( $res->content );
 
        foreach my $hash ( @{ $xml->{query}->{embeddedin}->{ei} } ) {
            push( @links,  {
                title          => $hash->{title},
            }
                  );
        }
    } while ($continue = $xml->{"query-continue"}->{embeddedin}->{eicontinue} );
 
    return @links;
}
 
sub get_pages_in_category {
    my $self     = shift;
    my $category = shift;
 
    my @pages;
    my $res = $self->_get( $category, 'view' );
    unless ($res) { return 1; }
    my $content = $res->content;
    while ( $content =~ m{href="(?:[^"]+)/Category:[^"]+">([^<]*)</a></div>}ig )
    {
        push @pages, 'Category:' . $1;
    }
    while ( $content =~
        m{<li><a href="(?:[^"]+)" title="([^"]+)">[^<]*</a></li>}ig ) {
        push @pages, $1;
    }
    while ( my $res = $self->{mech}->follow_link( text => '200 suivants' ) ) {
        sleep 1;    #Cheap hack to make sure we don't bog down the server
        my $content = $res->content;
        while ( $content =~
            m{<li><a href="(?:[^"]+)" title="([^"]+)">[^<]*</a></li>}ig ) {
            push @pages, $1;
        }
    }
    return @pages;
}
 
sub get_all_pages_in_category {
    my $self          = shift;
    my $base_category = shift;
    my @first         = $self->get_pages_in_category($base_category);
    my %data;
    foreach my $page (@first) {
        $data{$page} = '';
        if ( $page =~ /^Category:/ ) {
            my @pages = $self->get_all_pages_in_category($page);
            foreach (@pages) {
                $data{$_} = '';
            }
        }
    }
    return keys %data;
}
 
sub linksearch {
    my $self = shift;
    my $link = shift;
    my @links;
    my $res =
      $self->_get( "Special:Linksearch", "edit", "&target=$link&limit=500" );
    unless ($res) { return 1; }
    my $content = $res->content;
    while ( $content =~
        m{<li><a href.+>(.+?)</a> linked from <a href.+>(.+)</a></li>}g ) {
        push( @links, { link => $1, page => $2 } );
    }
    while ( my $res = $self->{mech}->follow_link( text => 'next 500' ) ) {
        sleep 2;
        my $content = $res->content;
        while ( $content =~
            m{<li><a href.+>(.+?)</a> linked from <a href=.+>(.+)</a></li>}g ) {
            push( @links, { link => $1, page => $2 } );
        }
    }
    return @links;
}
 
sub purge_page {
    my $self = shift;
    my $page = shift;
    my $res  = $self->_get( $page, 'purge' );
 
}
 
sub get_namespace_names {
        my $self = shift;
        my %return;
        my $res = $self->_get_api("action=query&meta=siteinfo&siprop=namespaces&format=xml");
        my $xml = XMLin( $res->content );
 
        foreach my $id ( keys %{ $xml->{query}->{namespaces}->{ns} } ) {
                $return{$id} = $xml->{query}->{namespaces}->{ns}->{$id}->{content};
        }
        return %return;
}
 
sub encyclo {
    my $self = shift;
    my @results;
 
    foreach my $page (@_) {
        if ($page =~ /:/ ) {
            if ( $page =~ /modèle:/i || $page =~ /template:/i ) {
                push(@results, $page);
            }
        } else {
            push(@results, $page);
        }
    }
 
    return @results;
}
 
1;