|
呵呵。
我做的很粗。
你可以再想点办法,比如替换掉我用HTML::TreeBuilder的地方,
使用HTML::Parser来换掉它,然后可以存储到你自己的数据库里。
哈哈。
[PHP]
#!/usr/bin/perl
use strict;
use warnings;
use Encode qw(:DEFAULT from_to);
use LWP;
use HTTP::Cookies;
use HTTP::Request::Common;
use HTML::TreeBuilder;
use HTML::Parser;
use HTML::FormatText;
use DBI;
use vars qw($article_id $bbs_id $title $bbs $parser);
# ---------------------------------------------------------------------------
# Main script: connect to the database, log in to the forum, then stream
# every digest index page through an HTML::Parser whose start handler
# (dostart) picks out the thread links.
# ---------------------------------------------------------------------------

# Number of the last digest index page to crawl.
my $totalpage = 532;

# DBI->connect returns undef on failure; stop here rather than crash later
# on the first prepare() call.
my $dbh = DBI->connect('dbi:mysql:host=你的主机;database=你数据库名', '你的用户名', '你的密码')
    or die "Cannot connect to database: $DBI::errstr\n";

my $url       = "http://bbs.chinaunix.net/digest.php?page=";
my $login_url = "http://bbs.chinaunix.net/logging.php?action=login";

# Log in first; the session cookie is kept in the cookie jar below and is
# sent with every later request made through $browser.
my $username = "CU的用户名,自己申请一个";
my $passwd   = "CU的用户密码,同上";
my $browser  = LWP::UserAgent->new;

# Persist cookies on disk so the login survives between requests.
$browser->cookie_jar(HTTP::Cookies->new(
    'file'     => '/home/ghw/myperl/ChinaUnix/lwp',
    'autosave' => 1,
));

my $new_agent = 'myremote/1.0';
$browser->agent($new_agent);

# By default LWP only follows redirects for GET and HEAD; the login form is
# submitted with POST, so POST must be allowed to follow redirects as well.
push @{ $browser->requests_redirectable }, 'POST';

# NOTE(review): formhash is hard-coded; it looks like a per-session token,
# so confirm it is still accepted by the server.
my $response = $browser->post(
    $login_url,
    [
        formhash    => 'eb595b8d',
        loginmode   => 'normal',
        styleid     => 1,
        cookietime  => 2592000,
        loginfield  => 'username',
        questionid  => 0,
        referer     => 'index.php',
        username    => $username,
        password    => $passwd,
        loginsubmit => '登陆',
    ]
);
die $response->message unless $response->is_success;

# NOTE(review): the crawl starts at page 4, not 1 -- presumably left over
# from resuming an interrupted run; confirm before relying on it.
for my $i ( 4 .. $totalpage ) {
    my $mypage = $url . $i;
    print $mypage, "\n";

    # A fresh parser per page. Only the start handler is armed; dostart
    # installs the text/end handlers when it sees a thread link.
    $parser = HTML::Parser->new( api_version => 3 );
    $parser->handler( start => \&dostart, 'self,tagname,attr' );
    $parser->handler( text  => undef );
    $parser->handler( end   => undef );

    # Feed the page to the parser chunk by chunk as it is downloaded.
    $response = $browser->request( GET($mypage), sub { $parser->parse(shift) } );
    $parser->eof;

    # Report pages that could not be fetched instead of skipping them silently.
    warn "Failed to fetch $mypage: ", $response->status_line, "\n"
        unless $response->is_success;

    # LWP traps a die() raised inside the content callback (and therefore
    # inside the parser handlers) and records it in the X-Died header.
    if ( defined( my $died = $response->header('X-Died') ) ) {
        warn "Processing of $mypage aborted: $died\n";
    }
}
# dostart($html, $tag, $attr)
#
# HTML::Parser "start" handler (argspec 'self,tagname,attr').
#
#   $html - the parser object that fired the event
#   $tag  - name of the start tag
#   $attr - hash reference of the tag's attributes
#
# When the tag is an <a> whose href points at a thread
# (viewthread.php?tid=N&fpage=1&highlight=), the thread id is stored in the
# global $article_id, $bbs_id and $title are reset, and the text/end handlers
# (do_text / do_end) are armed on the parser. Any other tag is ignored, so a
# tag nested inside the link no longer wipes $article_id while the text
# handler is still active.
sub dostart {
    my ( $html, $tag, $attr ) = @_;

    return unless defined $tag && $tag eq 'a';

    my $href = $attr->{href};
    return unless defined $href;

    # Thread ids are numeric; capture the id straight into a lexical so it
    # cannot be disturbed by anything that runs afterwards.
    my ($thread_id) = $href =~ /viewthread\.php\?tid=(\d+)&fpage=1&highlight=/;
    return unless defined $thread_id;

    # Start a fresh record for this link.
    $article_id = $thread_id;
    $bbs_id     = 0;
    $title      = "";

    # Remember which tag armed the handlers, then listen for the link text
    # and for the next end tag.
    $html->{last_tag} = $tag;
    $html->handler( text => \&do_text, 'self,dtext' );
    $html->handler( end  => \&do_end,  'self' );

    return;
}
# do_text($html, $text)
#
# HTML::Parser "text" handler (argspec 'self,dtext'), armed by dostart while
# the parser is inside a thread link.
#
#   $html - the parser object that fired the event (unused here)
#   $text - decoded text found inside the link; used as the article title
#
# Stores the trimmed text in the global $title, downloads the thread whose id
# is in the global $article_id, renders it to plain text and inserts it into
# the `article` table -- unless an article with the same title is already
# stored. Dies with the DBI error message when a database call fails.
sub do_text {
    my ( $html, $text ) = @_;

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    # Whitespace-only text carries no title; do not fetch or store anything.
    return if $text eq '';

    from_to( $text, 'gbk', 'gb2312', 1 );
    $title = $text;

    # Skip threads that are already stored. Doing this before the download
    # also saves a network round trip for known titles.
    my $count_sth = $dbh->prepare("select count(id) as num from article where title=?")
        or die $dbh->errstr, "\n";
    $count_sth->execute($title) or die $count_sth->errstr, "\n";
    my ($already_stored) = $count_sth->fetchrow_array;
    $count_sth->finish;
    return if $already_stored;

    # Download the thread and build a tree from it as the data arrives.
    my $request  = GET( "http://bbs.chinaunix.net/viewthread.php?tid=" . $article_id );
    my $tree     = HTML::TreeBuilder->new;
    my $temp_res = $browser->request( $request, sub { $tree->parse(shift) } );
    $tree->eof;

    unless ( $temp_res->is_success ) {
        warn "Failed to fetch thread $article_id: ", $temp_res->status_line, "\n";
        $tree->delete;
        return;
    }

    # Render the page as plain text, then release the tree explicitly: its
    # nodes reference each other and are not reliably freed otherwise.
    my $str = HTML::FormatText->new->format($tree);
    $tree->delete;
    from_to( $str, 'gbk', 'gb2312' );

    # Value written to the `mid` column of every stored article.
    my $board_id = 108;

    my $name = "你自己的名字";
    from_to( $name, 'UTF-8', 'gb2312' );

    my $insert_sth = $dbh->prepare(
        "insert into article (mid, title, fromaddr, content, submit_user, submit_date ) values (?, ?, ?, ?, ?, ? )"
    ) or die $dbh->errstr, "\n";

    # Report the DBI error; $@ only ever holds eval errors and would be empty.
    $insert_sth->execute(
        $board_id, $title, 'ChinaUnix.net', $str, $name, '2006-1-27 15:42:00'
    ) or die $insert_sth->errstr, "\n";

    return;
}
# do_end($html)
#
# HTML::Parser "end" handler (argspec 'self'). Clears the tag remembered in
# the parser's last_tag slot and disarms the text and end handlers on the
# parser it is given.
sub do_end {
    my ($parser_obj) = @_;

    # Keep the key but drop its value.
    $parser_obj->{last_tag} = undef;

    # Disarm both handlers, text first and then end.
    $parser_obj->handler( text => undef );
    return $parser_obj->handler( end => undef );
}
[/PHP] |
|