Web Scraping Proxy
by Howard P. Katseff 


Listing One

# Recorded session for loading the Amazon home page.  Each "# Request:"
# comment is followed by the Perl statement that replays that request; the
# remaining comments record the cookies sent, the response status and
# Set-Cookie headers, and a summary of the page that was finally saved.
# Requests are built with HTTP::Request->new(...) rather than the indirect
# object form "new HTTP::Request(...)", which Perl can misparse.

# Request: http://www.amazon.com/ 
$request = HTTP::Request->new('GET' => "http://www.amazon.com/");
# HTTP/1.1 302 Found
# Set-Cookie: skin=; domain=.amazon.com; path=/; 
#                                expires=Wed, 01-Aug-01 12:00:00 GMT
# Request: http://www.amazon.com:80/exec/obidos/subst/home/home.html 
$request = HTTP::Request->new('GET' => 
            "http://www.amazon.com:80/exec/obidos/subst/home/home.html");
# HTTP/1.1 302 Found
# Set-Cookie: session-id=102-0343904-7396174; path=/; domain=.amazon.com; 
#                                  expires=Tuesday, 06-Aug-2002 08:00:00 GMT
# Set-Cookie: session-id-time=1028620800; path=/; domain=.amazon.com; 
#                                  expires=Tuesday, 06-Aug-2002 08:00:00 GMT
# Request: 
#   http://www.amazon.com/exec/obidos/subst/home/home.html/102-0343904-7396174 
# Cookie: 'session-id', '102-0343904-7396174'
# Cookie: 'session-id-time', '1028620800'
$request = HTTP::Request->new('GET' => 
"http://www.amazon.com/exec/obidos/subst/home/home.html/102-0343904-7396174");
# Set-Cookie: ubid-main=430-4160616-7432656; path=/; domain=.amazon.com; 
#                                 expires=Tuesday, 01-Jan-2036 08:00:01 GMT
# Set-Cookie: obidos_path=continue-shopping-url=/subst/home/home.html/
#   102-0343904-7396174&continue-shopping-post-data=
#   &continue-shopping-description=generic.gateway.default; 
#   path=/; domain=.amazon.com
# Table 1: 9 rows; table nesting: 3
# Table 2: 84 rows; table nesting: 5
# Table 3: 4 rows; table nesting: 2
# Table 4: 1 rows
# Contains JavaScript
# Saving web page as w0

# Recorded request for the account login page, reached from the home page.
# The comments list the Referer and cookies that accompanied the request;
# the statement below replays it.
# Request: http://www.amazon.com/exec/obidos/account-access-login/
#                             ref=top_nav_ya_gateway/102-0343904-7396174 
# Referer: http://www.amazon.com/exec/obidos/subst/home/home.html/
#                             102-0343904-7396174
# Cookie: 'session-id', '102-0343904-7396174'
# Cookie: 'session-id-time', '1028620800'
# Cookie: 'ubid-main', '430-4160616-7432656'
# Cookie: 'obidos_path', 'continue-shopping-url=/subst/home/home.html/
#     102-0343904-7396174&continue-shopping-post-data=
#      &continue-shopping-description=generic.gateway.default'
# The URL is assembled by concatenation: a string literal that simply wraps
# onto the next line would embed the newline and indentation in the URL.
$request = HTTP::Request->new('GET' =>
    "http://www.amazon.com/exec/obidos/account-access-login/"
  . "ref=top_nav_ya_gateway/102-0343904-7396174");
# Table 1: 7 rows; table nesting: 3
# Table 2: 39 rows; table nesting: 3
# Table 3: 1 rows
# Contains JavaScript
# Saving web page as w1


Listing Two

# Recorded sign-in form submission.  The comments list the Referer and
# cookies sent with the request; the POST statement replays it (POST is the
# request constructor exported by HTTP::Request::Common), and the
# DIFFERENCES section lists only the fields whose submitted value differs
# from the value in the form served by the site.
# Request: https://www.amazon.com/exec/obidos/flex-sign-in-done/
#                                                        103-6178643-7537408 
# Referer: http://www.amazon.com/exec/obidos/flex-sign-in/
#       103-6178643-7537408?page=help%2Fya-sign-in-secure.html
#       &response=order-history-filtered&method=POST&opt=ab&return-url=
#       order-history-filtered&ss-order-filter=year-2002&Go.x=13&Go.y=9
# Cookie: 'session-id', '103-6178643-7537408'
# Cookie: 'session-id-time', '1028620800'
# Cookie: 'ubid-main', '430-2320918-7404815'
# Cookie: 'obidos_path', 'continue-shopping-url=/subst/home/home.html/
#       103-6178643-7537408&continue-shopping-post-data=
#       &continue-shopping-description=generic.gateway.default'
# The URL is kept on one line so that no newline or indentation leaks into
# it, and "@" is escaped so Perl does not interpolate an array @hotmail.
$request = POST(
    "https://www.amazon.com/exec/obidos/flex-sign-in-done/103-6178643-7537408",
    [
    'Go.x' => "13",
    'Go.y' => "9",
    'method' => "POST",
    'opt' => "ab",
    'page' => "help/ya-sign-in-secure.html",
    'response' => "order-history-filtered",
    'return-url' => "order-history-filtered",
    'ss-order-filter' => "year-2002",
    'email' => "hpk1024\@hotmail.com",
    'action' => "sign-in",
    'next-page' => "help/ya-register-secure.html",
    'password' => "mypassword",
    'x' => "159",
    'y' => "7",
    ]);
# DIFFERENCES between form from server and submitted form:
$post_args = { };
$post_args->{'password'} = "mypassword";  # was ""
$post_args->{'x'} = "159";  # was ""
$post_args->{'email'} = "hpk1024\@hotmail.com";  # was ""
$post_args->{'y'} = "7";  # was ""
# end DIFFERENCES
# Set-Cookie: x-main=zXhR??@ELakCfL?rLjUW?yCkcMYNSl4d; path=/; 
#               domain=.amazon.com; expires=Tuesday, 01-Jan-2036 08:00:01 GMT
# Set-Cookie: auth-browser-session-main=ss; path=/; domain=.amazon.com
# Set-Cookie: x-main=zXhR??@ELakCfL?rLjUW?yCkcMYNSl4d; path=/; 
#               domain=.amazon.com; expires=Tuesday, 01-Jan-2036 08:00:01 GMT
# Table 1: 5 rows; table nesting: 3
# Table 2: 1 rows
# Table 3: 4 rows; table nesting: 3
# Table 4: 2 rows; table nesting: 2
# Table 5: 7 rows; table nesting: 5
# Table 6: 1 rows
# Table 7: 1 rows
# Contains JavaScript
# Saving web page as w3


Listing Three

use strict;
use warnings;

use HTML::TableExtract;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST GET);
use LWP::UserAgent;

# Replays a recorded sign-in session: fetch the home page with a
# cookie-enabled user agent, then submit the sign-in form with every field
# written out literally.  The script stops as soon as a request fails.

# The cookie jar lets the user agent return cookies set by earlier responses.
my $ua  = LWP::UserAgent->new();
my $jar = HTTP::Cookies->new();
$ua->cookie_jar($jar);
$ua->agent("Microsoft Internet Explorer/5.5");

my $request = HTTP::Request->new('GET' => "http://www.amazon.com/");

my $webdoc = $ua->request($request);
die "GET http://www.amazon.com/ failed: ", $webdoc->status_line, "\n"
    unless $webdoc->is_success();

## [...several GET requests omitted...]

# The URL is kept on a single line: a string literal that wraps onto the
# next line would embed the newline and indentation in the URL.
# NOTE(review): the trailing path component is the session id of the
# recorded session; presumably a live run needs the id issued to this
# session instead -- confirm.
$request = POST(
    "https://www.amazon.com/exec/obidos/flex-sign-in-done/103-6178643-7537408",
    [
        'Go.x' => "13",
        'Go.y' => "9",
        'method' => "POST",
        'opt' => "ab",
        'page' => "help/ya-sign-in-secure.html",
        'response' => "order-history-filtered",
        'return-url' => "order-history-filtered",
        'ss-order-filter' => "year-2002",
        'email' => "hpk1024\@hotmail.com",
        'action' => "sign-in",
        'next-page' => "help/ya-register-secure.html",
        'password' => "mypassword",
        'x' => "159",
        'y' => "7",
    ]);
$webdoc = $ua->request($request);
die "sign-in POST failed: ", $webdoc->status_line, "\n"
    unless $webdoc->is_success();

# obtain information from $webdoc->content()
# HTML::TableExtract() might be useful here


Listing Four

use strict;
use warnings;

use HTML::Form;
use HTML::TableExtract;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST GET);
use LWP::UserAgent;

# Replays a recorded sign-in session, but instead of hard-coding every form
# field it parses the form out of the previously fetched page and overrides
# only the fields the user filled in.  The script stops as soon as a request
# fails or no form can be found.

# The cookie jar lets the user agent return cookies set by earlier responses.
my $ua  = LWP::UserAgent->new();
my $jar = HTTP::Cookies->new();
$ua->cookie_jar($jar);
$ua->agent("Microsoft Internet Explorer/5.5");
my $request = HTTP::Request->new('GET' => "http://www.amazon.com/");
my $webdoc = $ua->request($request);
die "GET http://www.amazon.com/ failed: ", $webdoc->status_line, "\n"
    unless $webdoc->is_success();

## [...several GET requests omitted...]

# Fields whose submitted value differs from the value in the served form.
my $post_args = { };
$post_args->{'password'} = "mypassword";  # was ""
$post_args->{'x'} = "159";  # was ""
$post_args->{'email'} = "hpk1024\@hotmail.com";  # was ""
$post_args->{'y'} = "7";  # was ""

# use form from previous GET request
# In scalar context HTML::Form->parse returns the first form in the page,
# or undef when the page contains none.
my $form = HTML::Form->parse($webdoc->content(), "http://www.amazon.com")
    or die "no form found in the previously fetched page\n";
for my $input ($form->inputs)
{
    # Keep values supplied above; take everything else from the form.
    my $name = $input->name;
    next if defined $name && defined $post_args->{$name};

    # form_name_value() returns a (possibly empty) list of name/value pairs.
    # Consume it pairwise so a false field name such as "0" does not end the
    # loop early.
    my @fnv = $input->form_name_value();
    while (@fnv)
    {
        my ($fname, $fvalue) = splice @fnv, 0, 2;
        $post_args->{$fname} = $fvalue;
    }
}

# The URL is kept on a single line: a string literal that wraps onto the
# next line would embed the newline and indentation in the URL.
# NOTE(review): the trailing path component is the session id of the
# recorded session; presumably a live run needs the id issued to this
# session instead -- confirm.
$request = POST(
    "https://www.amazon.com/exec/obidos/flex-sign-in-done/103-6178643-7537408",
    $post_args);
$webdoc = $ua->request($request);
die "sign-in POST failed: ", $webdoc->status_line, "\n"
    unless $webdoc->is_success();

# obtain information from $webdoc->content()
# HTML::TableExtract() might be useful here



4


