# Richard A. DeVenezia
# www.devenezia.com
#
# usage: table-extract.pl url "headings"
#
# headings is a space-separated list of regular expressions that the
# first line of the table of interest should match
#
# Fetches url, extracts every HTML table whose header row matches the
# given headings, and prints each matching table to STDOUT as
# comma-joined rows preceded by a "Table (coords):" line.
#
# Jan 16, 2004 - See SAS-L thread [Reading the data from the Internet]
#

use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use HTML::TableExtract;

my $url      = shift;
my $headings = shift;

die "usage: $0 url \"headings\"\n" unless defined $url;

# An omitted headings argument is treated as an empty list of patterns.
$headings = '' unless defined $headings;

# Spoof the user agent, some servers don't like LWP::Simple
my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0");

my $response = $ua->get($url);

# LWP reports failures through the response object, not through $!,
# so use the HTTP status line for the error message.
die "GET $url failed: ", $response->status_line, "\n"
    unless $response->is_success;

my $html_string = $response->content;

# split ' ' skips leading whitespace and collapses runs of whitespace, so
# extra spaces between headings do not produce empty header patterns.
my @header_patterns = split ' ', $headings;

my $te = HTML::TableExtract->new( headers => \@header_patterns );
$te->parse($html_string);

# Examine all matching tables
foreach my $ts ($te->table_states) {
    print "Table (", join(',', $ts->coords), "):\n";
    foreach my $row ($ts->rows) {
        # Print undefined cells as empty strings so that join does not
        # emit "uninitialized value" warnings.
        print join(',', map { defined $_ ? $_ : '' } @$row), "\n";
    }
}