strawberry perl bin perl exe -w use strict use warnings modules use LW

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
#!c:\strawberry\perl\bin\perl.exe -w
#
#*
use strict;
use warnings;
#* modules
use LWP;
use HTML::TokeParser;
use Encode;
#* set locale
use locale;
use POSIX qw(locale_h);
setlocale(LC_CTYPE, 'ru_RU.UTF-8');
setlocale(LC_ALL, 'ru_RU.UTF-8');
#* my user agent
my $ua = LWP::UserAgent->new;
#$ua->proxy("http", "http://192.168.200.96:8083");
#$ua->env_proxy;
#* system variables
#* debug
my $debug = 1; # 0 - off; 1 - on
#* define global variable
my $base_url = "http://uln.e-adres.ru";
my @first_level; # 1st level cat. array: @first_level[name][link]
my @second_level; # 2nd level cat. array: @second_leve[name][link][parent]
my @third_level; # 3rd level cat. array: @third_level[info][parent]
sub getUri() {
my $uri = shift @_;
my $resp = $ua->get($uri);
if (!$resp->is_success) {
print $resp->status_line;
return -1;
}
my $out = $resp->decoded_content;
return $out;
}
sub getPage() {
my $uri = shift @_;
my $res = &getUri($uri);
my $ndx = 0; # name index
#my $filename = "temp/$uri-out.html";
my $filename = "temp/$ndx-out.html";
$ndx++;
# $filename =~ /(\S+)http:\/\/(\S+)/;
# my $tmp1 = $1;
# my $tmp2 = $2;
# $tmp2 =~ s/\//_/g;
# $filename = $tmp1 . $tmp2;
# print "\n>> getPage($uri): $filename + $tmp2\n" if $debug;
open (FILE, ">$filename") or die "Couldn't open file: $!";
print FILE $res;
close (FILE);
return $filename;
}
sub getData() {
my $uri = shift @_;
#my $filename = &getPage($uri); # get filename
my $filename = &getUri($uri);
my $page = HTML::TokeParser->new(\$filename) or die "Error 1\n";
print "\n>> getData(): $uri\n" if $debug;
# not necessary if $filename is not a file name (=
# not necessary in this case while getPage() subroutine not used
#print "\n>> getData($uri): $filename" if $debug;
while (my $token = $page->get_tag("a")) {
my $link = $token->[1]{href};
my $text = $page->get_trimmed_text("/a");
next if !defined($link);
if ($link =~ /rubrics/g) {
my $goto = $base_url . $link;
print ">> getData(): while > goto: $goto\n" if $debug;
#&getData($goto);
}
}
return 0;
}
&getData($base_url);