package GCPlugins::GCbooks::GCAmazon;

###################################################
#
#  Copyright 2005-2009 Tian
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCbooks::GCbooksCommon;

{
    package GCPlugins::GCbooks::GCPluginAmazon;

    use base qw(GCPlugins::GCbooks::GCbooksPluginsBase);
    use XML::Simple;
    use LWP::Simple qw($ua);
    use Encode;
    use HTML::Entities;
    use GCUtils;

    # Parser callback invoked for every opening tag.
    #
    #   $tagname  - name of the tag being opened
    #   $attr     - hash ref of the tag attributes
    #   $attrseq, $origtext - received but not used here
    #
    # The 'vark*' tag names tested below are not standard HTML; they are
    # presumably markers injected into the page by preProcess (confirm
    # against the complete preProcess body).
    # When $self->{parsingList} is true the results list is being read and
    # entries of $self->{itemsList} are filled; otherwise a single item page
    # is being read and $self->{curInfo} is filled.
    sub start
    {
        my ($self, $tagname, $attr, $attrseq, $origtext) = @_;

        $self->{inside}->{$tagname}++;

        if ($self->{parsingList})
        {
            # Identify beginning of comments
            if (($self->{isComment} == 0) && ($tagname eq 'varkcomment'))
            {
                $self->{isComment} = 1;
            }

            # Capture URL of book: first link seen after a new result started
            if (($self->{isComment} == 0) && ($self->{isUrl} == 1) && ($tagname eq 'a'))
            {
                $self->{itemsList}[$self->{itemIdx}]->{url} = $attr->{href};
                $self->{isUrl} = 0;
                # The next non-empty text is taken as the title
                $self->{isTitle} = 1;
                return;
            }

            # Identify beginning of new book (next text is title)
            if (($self->{isComment} == 0) && ($tagname eq 'li') && ($attr->{id} =~ /result_[0-9]+/))
            {
                # Create new entry
                $self->{itemIdx}++;
                $self->{isUrl} = 1;
                $self->{isAuthor} = 0;
                return;
            }

            # Identify end of authors list
            if (($self->{isComment} == 0) && ($tagname eq 'varkendauthors') && ($self->{isAuthor} != 0))
            {
                $self->{isAuthor} = 0;
                return;
            }
        }
        else
        {
            # Detection of book themes
            if (($self->{isTheme} == 0) && ($tagname eq 'varkgenre'))
            {
                $self->{isTheme} = 1;
                return;
            }

            # Detection of book page count
            if (($self->{isPage} == 0) && ($tagname eq 'varkdata'))
            {
                $self->{isPage} = 1;
                return;
            }

            # Detection of authors
            if ($tagname eq 'varkauthor')
            {
                $self->{isAuthor} = 1;
                return;
            }

            # Capture of image
            if ($tagname eq 'varkimage')
            {
                # NOTE(review): a bare match /http.*?\.jpg/ used to be run on
                # the attribute here and its result thrown away; it has been
                # dropped as a no-op. If the intent was to extract the URL
                # from a longer value, that was never implemented - confirm.
                # (The attribute really is spelled 'adress'.)
                $attr->{adress} =~ s|https://images-na.ssl-images-amazon.com/images/I/|http://z2-ec2.images-amazon.com/images/I/|;
                $self->{curInfo}->{cover} = $attr->{adress};
                return;
            }

            # Detection of book description: state 1 once the marker is seen,
            # state 2 once the following <div> opens (text is then captured)
            if (($self->{isDescription} == 0) && ($tagname eq 'varkdescription'))
            {
                $self->{isDescription} = 1;
                return;
            }
            if (($self->{isDescription} == 1) && ($tagname eq 'div'))
            {
                $self->{isDescription} = 2;
                return;
            }

            # Detection title (state 2 = item-page title expected next)
            if (($self->{isTitle} == 0) && ($tagname eq 'varktitle'))
            {
                $self->{isTitle} = 2;
                return;
            }
        }
    }

    # Parser callback invoked for every closing tag.
    #
    #   $tagname - name of the tag being closed
    #
    # Only resets state flags; no data is captured here.
    sub end
    {
        my ($self, $tagname) = @_;

        $self->{inside}->{$tagname}--;

        if ($self->{parsingList})
        {
            # Identify end of comments
            if (($self->{isComment} == 1) && ($tagname eq 'varkcomment'))
            {
                $self->{isComment} = 0;
            }
        }
        else
        {
            # Finishing themes analysis
            if (($self->{isTheme} != 0) && ($tagname eq 'li'))
            {
                $self->{isTheme} = 0;
                return;
            }

            # Finishing description analysis
            if (($self->{isDescription} != 0) && ($tagname eq 'div'))
            {
                $self->{isDescription} = 0;
                return;
            }
        }
    }

    # Parser callback invoked for every text node.
    #
    #   $origtext - raw text; leading and trailing whitespace is removed
    #               before any test is applied
    #
    # Which field receives the text depends on the state flags set by
    # start/end and by earlier text nodes. The labels compared against
    # getTranslation(1..4) come from a method defined outside this section
    # (presumably the localized captions for publisher, language, ISBN and
    # dimensions - confirm).
    sub text
    {
        my ($self, $origtext) = @_;

        # Remove blanks before and after string (needed in both modes)
        $origtext =~ s/^\s+//;
        $origtext =~ s/\s+$//g;

        if ($self->{parsingList})
        {
            # Capture of book title
            if (($self->{isComment} == 0) && ($self->{isTitle} == 1) && ($origtext ne ''))
            {
                $self->{itemsList}[$self->{itemIdx}]->{title} = $origtext;
                $self->{isTitle} = 0;
                $self->{isPublication} = 1;
                return;
            }

            # Capture of book publication date
            if (($self->{isComment} == 0) && ($self->{isPublication} == 1) && ($origtext ne ''))
            {
                $self->{itemsList}[$self->{itemIdx}]->{publication} = $origtext;
                $self->{isAuthor} = 1;
                $self->{isPublication} = 0;
                return;
            }

            # Avoid a text area before the first author: that text is dropped
            if (($self->{isComment} == 0) && ($self->{isAuthor} == 1) && ($origtext ne ''))
            {
                $self->{isAuthor} = 2;
                return;
            }

            # Capture of authors: successive text chunks are joined by a space
            if (($self->{isComment} == 0) && ($self->{isAuthor} == 2) && ($origtext ne ''))
            {
                if ($self->{itemsList}[$self->{itemIdx}]->{authors} eq '')
                {
                    $self->{itemsList}[$self->{itemIdx}]->{authors} = $origtext;
                }
                else
                {
                    $self->{itemsList}[$self->{itemIdx}]->{authors} .= " " . $origtext;
                }
                return;
            }
        }
        else
        {
            # Capture of title
            if (($self->{isTitle} == 2) && ($origtext ne ''))
            {
                $self->{isTitle} = 0;
                $self->{curInfo}->{title} = $origtext;
                return;
            }

            # Capture of page number (text only has to start with digits)
            if (($self->{isPage} == 1) && ($origtext =~ /^[0-9]+/))
            {
                $self->{curInfo}->{pages} = $origtext;
                $self->{isPage} = 0;
                return;
            }

            # Capture of editor and publication date
            if (($self->{isEditor} == 0) && ($origtext eq $self->getTranslation(1)))
            {
                $self->{isEditor} = 1;
                return;
            }
            if (($self->{isEditor} == 1) && ($origtext ne ''))
            {
                # Text before the first '(' is the publisher, the chunk after
                # it is the publication date.
                my @parts = split(/\(/, $origtext);

                # Drop ';' BEFORE trimming: doing it afterwards (as the code
                # used to) left a trailing blank for input like "Name ; (date)".
                $parts[0] =~ s/\;//g;
                $parts[0] =~ s/^\s+//;
                $parts[0] =~ s/\s+$//g;

                $parts[1] =~ s/\)//g;
                $parts[1] =~ s/^\s+//;
                $parts[1] =~ s/\s+$//g;

                $self->{curInfo}->{publisher} = $parts[0];
                $self->{curInfo}->{publication} = $parts[1];
                $self->{isEditor} = 0;
                return;
            }

            # Capture of language
            if (($self->{isLanguage} == 0) && ($origtext eq $self->getTranslation(2)))
            {
                $self->{isLanguage} = 1;
                return;
            }
            if (($self->{isLanguage} == 1) && ($origtext ne ''))
            {
                $self->{curInfo}->{language} = $origtext;
                $self->{isLanguage} = 0;
                return;
            }

            # Capture of ISBN (stored without dashes)
            if (($self->{isISBN} == 0) && ($origtext eq $self->getTranslation(3)))
            {
                $self->{isISBN} = 1;
                return;
            }
            if (($self->{isISBN} == 1) && ($origtext ne ''))
            {
                $origtext =~ s|-||gi;
                $self->{curInfo}->{isbn} = $origtext;
                $self->{isISBN} = 0;
                return;
            }

            # Capture of book dimensions
            if (($self->{isSize} == 0) && ($origtext eq $self->getTranslation(4)))
            {
                $self->{isSize} = 1;
                return;
            }
            if (($self->{isSize} == 1) && ($origtext ne ''))
            {
                $self->{curInfo}->{format} = $origtext;
                $self->{isSize} = 0;
                return;
            }

            # Detection of themes: a lone '>' separator announces the next one
            if (($origtext eq '>') && ($self->{isTheme} == 1))
            {
                $self->{isTheme} = 2;
                return;
            }

            # Capture of themes, accumulated as a comma-separated list
            if (($self->{isTheme} == 2) && ($origtext ne ''))
            {
                if ($self->{curInfo}->{genre} eq '')
                {
                    $self->{curInfo}->{genre} = $origtext;
                }
                else
                {
                    $self->{curInfo}->{genre} .= ", " . $origtext;
                }
                $self->{isTheme} = 1;
                return;
            }

            # Capture of authors (any text containing "Ajax" is rejected)
            if (($self->{isAuthor} == 1) && ($origtext ne '') && ($origtext =~ /^(?:(?!Ajax).)*$/))
            {
                # Lower case for author names, except for first letters
                $origtext =~ s/([[:alpha:]]+)/ucfirst(lc $1)/egi;
                if ($self->{curInfo}->{authors} eq '')
                {
                    $self->{curInfo}->{authors} = $origtext;
                }
                else
                {
                    $self->{curInfo}->{authors} .= ", " . $origtext;
                }
                $self->{isAuthor} = 0;
                return;
            }

            # Capture of description: chunks are concatenated with no separator
            if (($self->{isDescription} == 2) && ($origtext ne ''))
            {
                if ($self->{curInfo}->{description} eq '')
                {
                    $self->{curInfo}->{description} = $origtext;
                }
                else
                {
                    $self->{curInfo}->{description} .= $origtext;
                }
                return;
            }
        }
    }

    # Constructor: builds the object through the parent class, declares
    # which fields are flagged in hasField, and resets every parsing state
    # flag to 0.
    sub new
    {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = $class->SUPER::new();
        bless ($self, $class);

        $self->{hasField} = {
            title => 1,
            authors => 1,
            publication => 1,
            format => 0,
            edition => 0,
        };

        $self->{isComment} = 0;
        $self->{isUrl} = 0;
        $self->{isTitle} = 0;
        $self->{isPublication} = 0;
        $self->{isAuthor} = 0;
        $self->{isPage} = 0;
        $self->{isEditor} = 0;
        $self->{isISBN} = 0;
        $self->{isDescription} = 0;
        $self->{isLanguage} = 0;
        $self->{isTheme} = 0;
        # isSize is tested in text(); initialise it like the other flags
        $self->{isSize} = 0;

        return $self;
    }

    # Returns the item URL unchanged.
    sub getItemUrl
    {
        my ($self, $url) = @_;

        return $url;
    }

    sub preProcess
    {
        my ($self, $html) = @_;

        if ($self->{parsingList})
        {
            # Analysis of results must be disabled during comments
            $html =~ s|||gi;
            # Remove other commercial offers
            $html =~ s|END SPONSORED LINKS SCRIPT.*||s;
            # End 
of authors listing detection $html =~ s|
||gi; $html =~ s|

||gi; $html =~ s|
||gi; } else { # Beginning of book data : pages, editor, publication date, ISBN, dimensions $html =~ s|||gi; # Beginning and end of book description $html =~ s|