package GCPlugins::GCfilms::GCAllocine;

###################################################
#
#  Copyright 2005-2010 Christian Jodar
#  Copyright 2015-2016 Kérénoc (kerenoc01 à Google Mail)
#
#  This file is part of GCstar.
#
#  GCstar is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  GCstar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with GCstar; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
#
###################################################

use strict;
use utf8;

use GCPlugins::GCfilms::GCfilmsCommon;

{
    # Film plugin scraping www.allocine.fr.
    #
    # The parser callbacks (start / end / text) implement a small state
    # machine driven by flags stored on $self. Two modes exist, selected by
    # $self->{parsingList}:
    #   - true  : a search-results page is being parsed; entries are stored
    #             in $self->{itemsList};
    #   - false : a film page (or its casting page) is being parsed; fields
    #             are stored in $self->{curInfo}.
    #
    # NOTE: state flags are compared with the string operator `eq` on
    # purpose: an undefined flag must NOT compare equal to 0, which a
    # numeric `==` would allow.
    package GCPlugins::GCfilms::GCPluginAllocine;

    use base qw(GCPlugins::GCfilms::GCfilmsPluginsBase);

    # Opening-tag callback (presumably invoked by the base class's HTML
    # parser -- confirm against GCfilmsPluginsBase).
    #
    #   $tagname : name of the tag being opened
    #   $attr    : hash ref of the tag's attributes
    #   $attrseq, $origtext : received but unused here
    #
    # Updates the state flags consumed later by text().
    sub start
    {
        my ($self, $tagname, $attr, $attrseq, $origtext) = @_;

        $self->{inside}->{$tagname}++;

        if ($self->{parsingList})
        {
            # Search-results page: only look at tags once text() has seen
            # the "N résultats trouvés" banner.
            if ($self->{insideResults} eq 1)
            {
                if (   ($tagname eq "a")
                    && ($attr->{href} =~ /^\/film\/fichefilm_gen_cfilm=/)
                    && ($self->{isMovie} eq 0))
                {
                    # First link to a film page: start a new result entry.
                    my $url = $attr->{href};
                    $self->{isMovie} = 1;
                    $self->{isInfo}  = 0;
                    $self->{itemIdx}++;
                    $self->{itemsList}[ $self->{itemIdx} ]->{url} = $url;
                }
                elsif (($tagname eq "td") && ($self->{isMovie} eq 1))
                {
                    $self->{isMovie} = 2;
                }
                elsif (($tagname eq "a") && ($self->{isMovie} eq 2))
                {
                    # From here on text() accumulates the title.
                    $self->{isMovie} = 3;
                }
                elsif (($tagname eq "br") && ($self->{isMovie} eq 3))
                {
                    # End of the title: trim it and collapse inner whitespace.
                    $self->{itemsList}[ $self->{itemIdx} ]->{title} =~ s/^\s*//;
                    $self->{itemsList}[ $self->{itemIdx} ]->{title} =~ s/\s*$//;
                    $self->{itemsList}[ $self->{itemIdx} ]->{title} =~ s/\s+/ /g;
                    $self->{isMovie} = 4;
                }
                elsif (($tagname eq "span")
                    && ($attr->{class} eq "fs11")
                    && ($self->{isMovie} eq 4))
                {
                    # Info area: isInfo 1 = year, 2 = director, 3 = actors;
                    # each <br> moves to the next step.
                    $self->{isInfo}  = 1;
                    $self->{isMovie} = 0;
                }
                elsif (($tagname eq "br") && ($self->{isInfo} eq 1))
                {
                    $self->{isInfo} = 2;
                }
                elsif (($tagname eq "br") && ($self->{isInfo} eq 2))
                {
                    $self->{isInfo} = 3;
                }
            }
        }
        else
        {
            # Film / casting page. Branch order matters: the first matching
            # condition wins.
            if (($tagname eq "span") && ($attr->{class} eq "thumbnail-link"))
            {
                $self->{insidePicture} = 1;
            }
            elsif (($tagname eq "img") && ($self->{insidePicture} eq 1))
            {
                # Keep only the first picture found.
                my $src = $attr->{src};
                if (!$self->{curInfo}->{image})
                {
                    $self->{curInfo}->{image} = $src;
                }
                $self->{insidePicture} = 0;
            }
            elsif ($tagname eq "h1")
            {
                $self->{insideTitle} = 1;
            }
            elsif (($tagname eq "span") && ($self->{insideDate} eq 1))
            {
                $self->{insideDate} = 2;
            }
            elsif (($tagname eq "span") && ($attr->{itemprop} eq "director"))
            {
                $self->{insideDirector} = 1;
            }
            elsif (($tagname eq "span") && ($attr->{itemprop} eq "duration"))
            {
                $self->{insideTime} = 1;
            }
            elsif (($tagname eq "span") && ($self->{insideDirector} eq 1))
            {
                $self->{insideDirector} = 2;
            }
            elsif (($tagname eq "div")
                && ($attr->{itemprop} eq "actor")
                && !$self->{curInfo}->{nextUrl})
            {
                # Actors are collected only from the casting page
                # (nextUrl = 0).
                $self->{insideActor} = 1;
            }
            elsif (($tagname eq "span")
                && ($attr->{itemprop} eq "name")
                && ($self->{insideActor} eq 1))
            {
                # Item where the actor name is followed by the role: name part.
                $self->{insideActor} = 2;
            }
            elsif (($tagname eq "span")
                && ($attr->{class} =~ m/col-xs/)
                && ($self->{insideActor} eq 1))
            {
                # Item where the role is followed by the actor name: role part.
                $self->{insideActor} = 3;
            }
            elsif (($tagname eq "span") && ($self->{insideGenre} eq 1))
            {
                $self->{insideGenre} = 2;
            }
            elsif (($tagname eq "span") && ($self->{insideCountry} eq 1))
            {
                $self->{insideCountry} = 2;
            }
            elsif (($tagname eq "span")
                && ($attr->{class} eq "stareval-note")
                && ($self->{insidePressRating} eq 1))
            {
                $self->{insidePressRating} = 2;
            }
            elsif (($tagname eq "div") && ($attr->{class} eq "breaker"))
            {
                $self->{insidePressRating} = 0;
            }
            elsif (($tagname eq "div") && ($attr->{itemprop} eq "description"))
            {
                $self->{insideSynopsis} = 1;
            }
            elsif (($tagname eq "span") && ($self->{insideOriginal} eq 1))
            {
                $self->{insideOriginal} = 2;
            }
        }
    }

    # Closing-tag callback: resets the state flags whose scope ends with
    # the given tag.
    #
    #   $tagname : name of the tag being closed
    sub end
    {
        my ($self, $tagname) = @_;

        $self->{inside}->{$tagname}--;

        if ($tagname eq "li")
        {
            $self->{insideDirector} = 0;
            $self->{insideGenre}    = 0;
        }
        elsif ($tagname eq "div")
        {
            $self->{insideCountry}  = 0;
            $self->{insideSynopsis} = 0;
            $self->{insideGenre}    = 0;
        }
        elsif ($tagname eq "th")
        {
            $self->{insideSynopsis} = 0;
        }
        elsif ($tagname eq "table")
        {
            $self->{insideResults} = 0;
        }
    }

    # Text callback: stores the text chunk in the field selected by the
    # current state flags, or arms a flag when the chunk is a known label
    # ("Date de sortie", "Genre", "Nationalité", ...).
    #
    #   $origtext : text found between two tags
    sub text
    {
        my ($self, $origtext) = @_;

        if ($self->{parsingList})
        {
            # The results table is only parsed when the banner announces at
            # least one hit among film titles.
            if (($origtext =~ m/(\d+) r..?sultats? trouv..?s? dans les titres de films/)
                && ($1 > 0))
            {
                $self->{insideResults} = 1;
            }

            if ($self->{isMovie} eq 3)
            {
                # The title may be split over several chunks; start() cleans
                # it up at the following <br>.
                $self->{itemsList}[ $self->{itemIdx} ]->{title} .= $origtext;
            }

            if ($self->{isInfo} eq 1)
            {
                if ($origtext =~ /\s*([0-9]{4})/)
                {
                    $self->{itemsList}[ $self->{itemIdx} ]->{date} = $1;
                }
            }
            elsif ($self->{isInfo} eq 2)
            {
                if ($origtext =~ /^\s*de (.*)/)
                {
                    $self->{itemsList}[ $self->{itemIdx} ]->{director} = $1;
                }
            }
            elsif ($self->{isInfo} eq 3)
            {
                if (   ($origtext =~ m/^\s*avec (.*)/)
                    && (!$self->{itemsList}[ $self->{itemIdx} ]->{actors}))
                {
                    $self->{itemsList}[ $self->{itemIdx} ]->{actors} = $1;
                }
                $self->{isInfo} = 0;
            }
        }
        else
        {
            # Normalise the chunk: drop line breaks and surrounding blanks.
            $origtext =~ s/[\r\n]//g;
            $origtext =~ s/^\s*//;
            $origtext =~ s/\s*$//;

            if ($self->{insideTitle} eq 1)
            {
                # Two-pass plugin: {title} is set in the first pass.
                if (!$self->{curInfo}->{title})
                {
                    # First pass: schedule the second web page (casting),
                    # e.g. .../fichefilm_gen_cfilm=123.html
                    #   -> .../fichefilm-123/casting
                    my $fileCasting = $self->{curInfo}->{ $self->{urlField} };
                    $fileCasting =~ s/_gen_cfilm=/-/;
                    # The dot is escaped so only a literal ".html" is replaced.
                    $fileCasting =~ s/\.html/\/casting/;
                    $self->{curInfo}->{nextUrl} = $fileCasting;
                }
                $self->{curInfo}->{title} = $origtext
                  if (!$self->{curInfo}->{title});
                $self->{insideTitle} = 0;
            }
            elsif (($self->{insideDate} eq 2) && (length($origtext) > 1))
            {
                $self->{curInfo}->{date} = $self->decodeDate($origtext)
                  if !($origtext =~ /inconnu/);
                $self->{insideDate} = 0;
                # The chunk following the date is handled as the duration.
                $self->{insideTime} = 1;
            }
            elsif ($self->{insideTime} eq 1)
            {
                # Duration such as "(1h 30min)", stored in minutes.
                $self->{curInfo}->{time}  = _durationToMinutes($origtext);
                $self->{insideTime} = 0;
            }
            elsif (($origtext =~ /^Date de sortie/) && (!$self->{curInfo}->{date}))
            {
                $self->{insideDate} = 1;
            }
            elsif (($origtext =~ /^Date de reprise/) && (!$self->{curInfo}->{date}))
            {
                $self->{insideDate} = 1;
            }
            elsif ($self->{insideDirector} eq 2)
            {
                # Several directors are joined with ", ".
                if ($self->{curInfo}->{director})
                {
                    $self->{curInfo}->{director} .= ", " . $origtext;
                }
                else
                {
                    $self->{curInfo}->{director} .= $origtext;
                }
                $self->{insideDirector} = 0;
            }
            elsif ($self->{insideGenre} eq 2)
            {
                # Separator chunks are reduced to a bare comma.
                $origtext = "," if $origtext =~ m/^,/;
                $self->{curInfo}->{genre} .= $origtext;
            }
            elsif ($origtext =~ /^[\s\n]*Genre/)
            {
                $self->{insideGenre} = 1;
            }
            elsif ($self->{insideCountry} eq 2)
            {
                $origtext = "," if $origtext =~ m/^,/;
                $self->{curInfo}->{country} .= $origtext;
            }
            elsif ($self->{insideActor} > 1)
            {
                # Strip decorations and ignore separator / empty chunks.
                $origtext =~ s/\s*plus\s*//;
                $origtext =~ s/\s*Rôle\s*:\s*//;
                return if ($origtext eq "," || $origtext eq '');

                # The page lists either "name then role" or "role then
                # name"; whichever part comes first, switch to expecting the
                # other one.
                if ($self->{insideActor} eq 2)
                {
                    $self->{actor} = $origtext;
                    $self->{insideActor} = 3 if (!$self->{role});
                }
                elsif ($self->{insideActor} eq 3)
                {
                    $self->{role} = $origtext;
                    $self->{insideActor} = 2 if (!$self->{actor});
                }

                if ($self->{actor} && $self->{role})
                {
                    # Store the pair in one step rather than indexing the
                    # list with actorsCounter: the counter is only
                    # initialised in new(), so indexing with it would
                    # misplace the role if the counter and the list length
                    # ever disagree.
                    push @{ $self->{curInfo}->{actors} },
                      [ $self->{actor}, $self->{role} ];
                    $self->{actorsCounter}++;
                    $self->{actor}       = "";
                    $self->{role}        = "";
                    $self->{insideActor} = 0;
                }
            }
            elsif ($origtext =~ /Nationalité/)
            {
                $self->{insideCountry} = 1;
            }
            elsif ($origtext =~ /^Presse$/)
            {
                $self->{insidePressRating} = 1;
            }
            elsif ($self->{insidePressRating} eq 2)
            {
                # Decimal comma -> decimal point, then the note is doubled
                # (presumably converting a /5 note to /10 -- confirm).
                # NOTE(review): `.=` appends to any previous value; verify
                # that a plain assignment is not what is intended.
                $origtext =~ s/,/./;
                $self->{curInfo}->{ratingpress} .= $origtext * 2;
                $self->{insidePressRating} = 0;
            }
            elsif ($origtext =~ m/^Interdit aux moins de (\d+) ans/)
            {
                $self->{curInfo}->{age} = $1;
            }
            elsif ($self->{insideSynopsis} eq 1)
            {
                $self->{curInfo}->{synopsis} .= $origtext;
            }
            elsif ($self->{insideOriginal} eq 2)
            {
                $self->{curInfo}->{original} = $origtext;
                $self->{insideOriginal} = 0;
            }
            elsif ($origtext =~ /^R..?alis..? par/)
            {
                $self->{insideDirector} = 1;
            }
            elsif ($origtext =~ m/Titre original/)
            {
                $self->{insideOriginal} = 1;
            }
        }
    }

    # Private helper (plain function, not a method): converts a duration
    # text to a number of minutes.
    #
    #   "(1h 30min)" -> 90      "2h" -> 120      "45min" -> 45
    #
    # Returns 0 when no number can be found.
    sub _durationToMinutes
    {
        my ($text) = @_;

        return 0 if !defined $text;

        my ($hours, $minutes) = (0, 0);
        if ($text =~ /(\d+)\s*h\s*(\d*)/)
        {
            $hours   = $1;
            $minutes = $2 if length($2);
        }
        elsif ($text =~ /(\d+)/)
        {
            # No hour part: the number is already a count of minutes.
            $minutes = $1;
        }

        return $hours * 60 + $minutes;
    }

    # Constructor: builds the object through the parent class, declares
    # which fields are available in search results and initialises the
    # parser state.
    sub new
    {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = $class->SUPER::new();

        # Fields filled in for each entry of the results list.
        $self->{hasField} = {
            title    => 1,
            date     => 1,
            director => 1,
            actors   => 1,
        };

        $self->{isInfo}        = 0;
        $self->{isMovie}       = 0;
        $self->{insideResults} = 0;
        $self->{curName}       = undef;
        $self->{curUrl}        = undef;
        $self->{actorsCounter} = 0;

        bless($self, $class);
        return $self;
    }

    # Hook receiving the page content before parsing; returns it unchanged.
    sub preProcess
    {
        my ($self, $html) = @_;

        return $html;
    }

    # Returns the search URL for $word. $word is inserted as is.
    sub getSearchUrl
    {
        my ($self, $word) = @_;

        # f=3 ?
        # return "http://www.allocine.fr/recherche/?q=$word&f=3&rub=1";
        return "http://www.allocine.fr/recherche/1/?q=$word";
    }

    # Returns the charset used for the search query.
    sub getSearchCharset
    {
        my $self = shift;

        # Need urls to be double character encoded
        return "utf8";
    }

    # Turns a site-relative URL into an absolute one.
    sub getItemUrl
    {
        my ($self, $url) = @_;

        return "http://www.allocine.fr" . $url;
    }

    # Returns the plugin's display name.
    sub getName
    {
        return "Allocine.fr";
    }

    # Returns the plugin's authors.
    sub getAuthor
    {
        return 'Tian - Kerenoc';
    }

    # Returns the language code of the site.
    sub getLang
    {
        return 'FR';
    }

    # Returns the charset of the fetched pages.
    sub getCharset
    {
        # return "UTF-8"; # For 1.5.0 Win32
        return "ISO-8859-1"; # For 1.5.0 Win32 with /lib/gcstar/GCPlugins/ ver.1.5.9svn
    }

    # Converts a French textual date to day/month/year.
    #
    #   "12 mars 2010" -> "12/03/2010"
    #   "mars 2010"    -> "01/03/2010"   (day defaults to 01)
    #   "12/03/2010"   -> returned unchanged
    #
    # Returns "" when the date cannot be decoded (fewer than two items, or
    # unrecognised month name).
    sub decodeDate
    {
        my ($self, $date) = @_;

        return "" if !defined $date;

        # Date already in the right format.
        return $date if ($date =~ m|/|);

        # Date to convert to day/month/year format.
        my @dateItems = grep { length } split(/\s+/, $date);
        my $nbDates   = scalar @dateItems;
        return "" if ($nbDates < 2);

        # Accented letters are matched with ".*" so that the result does not
        # depend on how the page was decoded.
        my @listeMois = (
            "janvier", "f.*vrier", "mars",      "avril",
            "mai",     "juin",     "juillet",   "ao.*t",
            "septembre", "octobre", "novembre", "d.*cembre",
        );

        # The month name is always the item just before the year.
        my $nomMois = $dateItems[ $nbDates - 2 ];
        my $mois    = 0;
        for my $idx (0 .. $#listeMois)
        {
            if ($nomMois =~ m/$listeMois[$idx]/i)
            {
                $mois = $idx + 1;
                last;
            }
        }

        # Unknown month: do not fabricate an invalid month number.
        return "" if !$mois;

        return sprintf("%02d/%02d", $dateItems[0], $mois) . "/" . $dateItems[ $nbDates - 1 ]
          if ($nbDates > 2);
        return sprintf("01/%02d", $mois) . "/" . $dateItems[1];
    }
}

1;