#!/usr/local/bin/perl
#
# curlmirror.pl
#
# Mirrors a web site by using curl to download each page.
# The result is stored in a directory named "dest" by default.
# Temporary files are stored in "/tmp".
#
# Author: Kjell.Ericson@haxx.se
#
# Limitations:
#  All links are right now based from the root, so there are a lot
#  of "../../" in pages.
#
# History:
#
# 1999-11-19 v0.9  - Kjell Ericson - First version
# 1999-11-22 v0.10 - Kjell Ericson - Added some more flags
# 1999-12-06 v0.11 - Kjell Ericson - Relative paths were not corrected
# 1999-12-06 v1.0  - Kjell Ericson - Satisfied and updated to v1.0
# 1999-12-07 v1.1  - Kjell Ericson - Added "-p"
# 1999-12-08 v1.2  - Kjell Ericson - Added "-l" and "-c"
# 1999-12-13 v1.3  - Kjell Ericson - Added match for images in stylesheets
# 2000-08-07 v1.4  - Kjell Ericson - Handles both ' and " in links.
# 2000-08-15 v1.5  - Kjell Ericson - Added -I.
# 2000-08-16 v1.6  - Kjell Ericson - Added multiple -I and -B.
# 2002-01-23 v1.7  - Anthony Thyssen - Changed the destination filename
# 2002-07-14 v1.8  - Kjell Ericson - Corrected a temp-filename error
#

use File::Copy;                 # move()
use File::Path;                 # mkpath()

# NOTE: configuration and shared state are deliberately kept as package
# globals (no "my" at file level); the URL helper subs further down in
# this file read some of them (@basematch, @ignorepatt, $verbose).

# Defaults, most of them can be changed from the command line.
$max_deep=1000;
$max_size=2;                    # in Mb
$dest_dir="dest";
$default_name="index.html";
$tmp="/tmp";
$filecounter=0;
$total_size=0;
$curl_args="";
$strip_from_file="";
$base="";
$start="";
$ignore_cgi=0;
$only_html=0;
$verbose=0;
$picture_load=0;
$flat=0;

# For faster handling we have this regex:
$nonhtmlfiles="jpg|gif|png|zip|doc|txt|pdf|exe|java";

$help=
    "Usage: curlmirror.pl [flags] [url]\n".
    "\n".
    "-a : Curl specific arguments\n".
    "-B : Only retrieve URL below this URL (default is [url]).\n".
    "-b : Pattern that will be stripped from filename.\n".
    "-c : Ignore CGI's (i.e URL's with '?' in them) (default off).\n".
    "-d : Depth to scan on (default unlimited).\n".
    "-f : Flat directory structure is made (be careful).\n".
    "-F : Flat directory structure but use path in filename.\n".
    "-i : Default name for unknown filenames (default is 'index.html').\n".
    "-I : Don't handle files matching this pattern (default is \"\").\n".
    "-l : Only load HTML-pages - no images (default is to load all).\n".
    "-o : Directory to output result in (default is 'dest').\n".
    "-s : Max size in Mb of downloaded data (default 2 Mb)\n".
    "-p : Always load images (default is not to).\n".
    "-t : Temporary directory (default is '/tmp').\n".
    "-v : Verbose output.\n".
    "\n".
    "Example:\n".
    "curlmirror.pl http://www.perl.com/\n".
    "\nAuthor: Kjell.Ericson\@haxx.se\n";

# ----------------------------------------------------------------------
# Command line.  Flags may be combined in one word ("-vc"); every flag
# that takes a value consumes the next word of @ARGV.
# ----------------------------------------------------------------------
for (my $i=0; $i<=$#ARGV; $i++) {
    my $arg=$ARGV[$i];
    if ($arg =~ s/^-//) {
        if ($arg =~ m/\?/) {
            print $help;
            exit();
        }
        if ($arg =~ m/a/) {
            $curl_args=$ARGV[++$i];
        }
        if ($arg =~ m/B/) {
            $base=$ARGV[++$i];
            if ($base !~ m/(https?:\/\/[^\/]*)/i) {
                print "***Malformed -B(ase)\n";
                die($help);
            }
            # The base is a URL, not a pattern: escape every regex
            # metacharacter before it is used in a match.
            push @basematch, quotemeta($base);
        }
        if ($arg =~ m/I/) {
            push @ignorepatt, $ARGV[++$i];
        }
        if ($arg =~ m/b/) {
            $strip_from_file=$ARGV[++$i];
        }
        if ($arg =~ m/c/) {
            $ignore_cgi=1;
        }
        if ($arg =~ m/d/) {
            $max_deep=$ARGV[++$i];
        }
        if ($arg =~ m/o/) {
            $dest_dir=$ARGV[++$i];
            $dest_dir=~ s/\/$//g;
        }
        if ($arg =~ m/t/) {
            $tmp=$ARGV[++$i];
            $tmp=~ s/\/$//g;
        }
        if ($arg =~ m/s/) {
            $max_size=$ARGV[++$i];
        }
        if ($arg =~ m/i/) {
            $default_name=$ARGV[++$i];
        }
        if ($arg =~ m/l/) {
            $only_html=1;
        }
        if ($arg =~ m/v/) {
            $verbose=1;
        }
        if ($arg =~ m/p/) {
            $picture_load=1;
        }
        if ($arg =~ m/f/) {
            $flat=1;
        }
        if ($arg =~ m/F/) {
            $flat=2;
        }
    }
    else { # default
        $start=$arg;
    }
}

# $curl_args is inserted unquoted on purpose: it comes from the user's own
# command line and may hold several shell words.
$curl="curl -s $curl_args ";

if ($base eq "") {
    if ($start !~ m/(https?:\/\/.+\/)/i) {
        if ($start =~ m/(https?:\/\/.+)/i) {
            $start.="/";
        }
        else {
            print "***Malformed start URL ($start)\n";
            die($help);
        }
    }
    $base=$start;
    $base=~ s/\/[^\/]+$/\//;    # strip docname
    $basematch[0]=quotemeta($base);
}

# %follow_link: URL => depth it is queued for, or -1 once it is handled.
$follow_link{"start"}=0;

# %follow: regex => kind of link.  In every regex $1 is everything up to
# and including the "=", $2 is the (possibly quoted) link itself.
$linktmp="[ \n\r]*=[ \r\n]*)([\"'][^\"']*[\"']|[^ )>]*)";

%follow=(
    "(<[^>]*a[^>]+href$linktmp", "link",
    "(<[^>]*area[^>]+href$linktmp", "link",
    "(<[^>]*frame[^>]+src$linktmp", "link",
);

if ($only_html == 0) {
    %follow=(%follow,
        # for stylesheets (the "{" is escaped so the regex engine never
        # mistakes it for a quantifier)
        "(BODY[^>]*\\{[^}>]*background-image:[^>}]*url[(])([^}>) ]+)", "img",
        "(<[^>]*img[^>]+src$linktmp", "img",
        "(<[^>]*body[^>]+background$linktmp", "img",
        "(<[^>]*applet[^>]+archive$linktmp", "archive",
        "(<[^>]*td[^>]+background$linktmp", "img",
        "(<[^>]*tr[^>]+background$linktmp", "img",
        "(<[^>]*table[^>]+background$linktmp", "img",
    );
}

# ----------------------------------------------------------------------
# Pass 1: download.  Breadth first: each round handles the URLs queued
# for depth $deep and queues the links found in them for the next depth.
# ----------------------------------------------------------------------
$deep=0;
$found=1;
while ($found && $deep<$max_deep) {
    $found=0;
    foreach my $url (keys %follow_link) {
        my $current_depth=$follow_link{$url};
        # print STDERR ">$url $current_depth\n";
        next if ($current_depth != $deep || $current_depth < 0);
        next if ($total_size >= $max_size*1024*1024);

        $found=1;
        $current_depth++;
        if ($url eq "start") {
            delete $follow_link{$url};
            $url=$start;
            $url="stdin" if ($url eq "");
            $start="";
        }
        $follow_link{$url}=-1;

        my $real_url=$url;
        $real_url=~ s/#(.*)//;  # strip bookmarks before loading
        next if ($url =~ m/[ \n\r]/);

        # Temporary file name: running counter (keeps it unique) plus a
        # sanitized copy of the URL.
        $filecounter++;
        my $this_file_name="$filecounter..$real_url";
        $this_file_name=~ s/%([a-fA-F0-9][a-fA-F0-9])/chr hex $1/eg;
        $this_file_name=~ s/[^\w\d]+/_/g;
        # Very long URLs would exceed the file name limit of most file
        # systems; the counter prefix keeps the shortened name unique.
        $this_file_name=substr($this_file_name, 0, 200);
        my $tmp_file="$tmp/$this_file_name";

        print STDERR "Get $deep:$url\n" if ($verbose);
        my $head=_fetch_with_curl($real_url, $tmp_file);
        $filenames{$real_url}=$this_file_name;

        # Follow one redirect.  Anchored at line start so that a
        # "Content-Location:" header is not mistaken for "Location:".
        if ($head =~ m/^Location:[ \t]*"?([^\r\n]*)/mi) {
            my $loc=$1;
            $loc=~ s/[\r\n]//g;
            $loc=merge_urls($real_url, $loc);
            if (defined($loc) &&
                (accept_url($loc) ||
                 ($picture_load && $linkkind{$real_url} eq "img"))) {
                unlink($tmp_file);
                delete $filenames{$real_url};
                $real_url=$loc;
                $url=$loc;
                print STDERR "Reget $deep:$url\n" if ($verbose);
                $head=_fetch_with_curl($real_url, $tmp_file);
                $filenames{$real_url}=$this_file_name;
                $follow_link{$real_url}=-1;
            }
        }

        my $size=-s $tmp_file;
        $size=0 if (!defined($size));
        $total_size+=$size;

        # Only the value of the Content-Type line itself, never the
        # header lines that follow it.
        my $content_type="";
        if ($head =~ m/^Content-Type:[ \t]*([^\r\n]*)/mi) {
            $content_type=$1;
        }
        $linktype{$real_url}=$content_type;

        if ($content_type !~ m/html/i) {
            if ($only_html) { # remove this file
                $total_size-=$size;
                unlink($tmp_file);
                delete $filenames{$real_url};
            }
            next;
        }

        next if ($current_depth >= $max_deep);
        $linktype{$real_url}="html";
        next if ($url =~ m/\#/);    # URLs with a bookmark are not scanned

        # Collect the links of this page.  Every match is cut out of
        # $text so that the same regex finds the next link on its next try.
        my $text=_slurp_file($tmp_file);
        foreach my $search (keys %follow) {
            while ($text =~ s/$search//si) {
                my $link=$2;
                $link=~ s/[\"\']//g;
                $link=~ s/#.*//;
                my $newurl=merge_urls($url, $link);
                next if (!defined($newurl) || $newurl eq "");
                next if ($ignore_cgi && $newurl =~ m/\?/);
                next unless (accept_url($newurl) ||
                             ($picture_load && $follow{$search} eq "img"));
                next if (exists $follow_link{$newurl});
                next if ($only_html && $newurl =~ m/\.($nonhtmlfiles)$/i);
                $follow_link{$newurl}=$current_depth;
                # Remember what kind of link led here ("link", "img",
                # "archive"); used by the redirect test above for -p.
                $linkkind{$newurl}=$follow{$search};
            }
        }
    }
    $deep++;
}

print STDERR "Max size exceeded ($total_size bytes)!\n"
    if ($total_size>=$max_size*1024*1024);
print STDERR "Total size loaded:$total_size bytes\n" if ($verbose);

# ----------------------------------------------------------------------
# Pass 2: decide the destination file name of every downloaded URL
# (%destfile) and create the directories needed.
# ----------------------------------------------------------------------
foreach my $url (keys %filenames) {
    my $destname=$url;
    $destname=~ s/$basematch[0]//;
    my $destdir=$destname;
    $destdir="" if ($destdir !~ m/\//);
    $destdir=~ s/\/[^\/]*$/\//;
    $destname=~ s/^.*\///g;
    $destname=~ s/#(.*)//;
    $destname=~ s/[^a-zA-Z0-9.]/_/g;   # strip chars we don't want in a filename
    $destname=~ s/^\.+$/_/;            # "." and ".." are not file names

    $destdir=~ s/$strip_from_file// if ($strip_from_file ne "");
    $destdir=~ s/^([^\/]+):\/\//${1}_/;
    $destdir=~ s/[^a-zA-Z0-9.\/_]/_/g;
    $destdir=~ s/(^\/)|(\/$)//g;       # strip trailing/leading slashes
    # URLs come from downloaded pages: never let "." or ".." directory
    # segments lead the result outside the output directory.
    $destdir=~ s{(^|/)\.\.?(?![^/])}{${1}_}g;

    if ($flat) {
        if ($flat==2) {
            $destdir=~ s/[\/:]/_/g;
            $destdir.="_";
        }
        else {
            $destdir="";
        }
        _make_dir($dest_dir);
    }
    else {
        # No stripping of a leading "/" here: an absolute -o directory
        # must stay absolute, the files are written there later on.
        _make_dir("$dest_dir/$destdir");
        $destdir.="/" if ($destdir ne "");
    }
    $destname=$default_name if ($destname eq "");
    $destname=$destdir.$destname if ($destdir ne "");
    if (($linktype{$url} =~ m/html/i) && ($destname !~ m/\.[s]?htm/i)) {
        $destname.=".html";
    }
    $destfile{$url}=$destname;
}

# ----------------------------------------------------------------------
# Pass 3: move the files into place.  HTML pages get their links
# rewritten to point at the mirrored copies.
# ----------------------------------------------------------------------
foreach my $url (keys %filenames) {
    my $tmp_file="$tmp/$filenames{$url}";
    my $out_file="$dest_dir/$destfile{$url}";

    if ($linktype{$url} !~ m/html/i) {
        move($tmp_file, $out_file)
            or print STDERR "Couldn't save file '$out_file'\n";
        next;
    }

    my $text=_slurp_file($tmp_file);
    # NOTE(review): the "a" regex also matches <area> tags, so such a link
    # appears to be rewritten by two passes of this loop - confirm whether
    # the second pass can turn it back into an absolute URL.
    foreach my $search (keys %follow) {
        $text=~ s/$search/"$1\"".make_file_relative($url,merge_urls($url, $2))."\""/sgie;
    }
    if (open(my $out, ">", $out_file)) {
        binmode($out);
        print $out $text;
        close($out) or print STDERR "Couldn't save file '$out_file'\n";
    }
    else {
        print STDERR "Couldn't save file '$out_file'\n";
    }
    unlink($tmp_file);
}

# Input: text
#
# Function: Return the text as one single-quoted word for /bin/sh, so
# that nothing inside it ($(...), backquotes, quotes) is interpreted.
#
sub _shell_quote {
    my ($text)=@_;
    $text="" if (!defined($text));
    $text=~ s/'/'\\''/g;
    return "'$text'";
}

# Input: URL to load, name of the file to store the body in
#
# Function: Download the URL with curl.  Returns the response headers
# ("" if curl gave no output).  URL and file name are shell-quoted since
# the URL comes from downloaded pages and headers.
#
sub _fetch_with_curl {
    my ($fetch_url, $file)=@_;
    my $command="$curl -D - -o "._shell_quote($file).
        " --url "._shell_quote($fetch_url);
    my $headers=`$command`;
    return defined($headers) ? $headers : "";
}

# Input: file name
#
# Function: Return the whole content of the file, "" if it can't be read.
#
sub _slurp_file {
    my ($file)=@_;
    local $/;                   # read everything in one go
    open(my $in, "<", $file) or return "";
    binmode($in);
    my $data=<$in>;
    close($in);
    return defined($data) ? $data : "";
}

# Input: directory name
#
# Function: Create the directory and all missing parents.  A failure is
# only reported; the later attempt to save the file reports its own error.
#
sub _make_dir {
    my ($dir)=@_;
    return if ($dir eq "" || -d $dir);
    eval { mkpath($dir); };     # mkpath() dies on failure
    print STDERR "Couldn't create directory '$dir'\n" if ($@);
}

# Input: Base-URL, MakeRelative-URL
#
# Function: Convert and return "MakeRelative-URL" to be relative
# to "Base-URL".  Both are looked up in %destfile; a URL that has not
# been mirrored is returned unchanged.  A bookmark on MakeRelative-URL
# is kept.
#
sub make_file_relative {
    my ($from, $to)=@_;
    my $bookmark="";

    if ($to =~ s/(\#.*)$//) { # extract bookmarks
        $bookmark=$1;
    }
    my $destname=$destfile{$to};
    if (!defined($destname) || $destname eq "") {
        return $to.$bookmark;
    }

    my $sourcename=$destfile{$from};
    $sourcename="" if (!defined($sourcename));

    # Directories of the page holding the link (file name dropped) and
    # the full path of the link target.
    my @source_dirs=split(/\//, $sourcename, -1);
    pop @source_dirs;
    my @dest_parts=split(/\//, $destname, -1);

    # Drop every leading directory the two paths have in common.
    while (@source_dirs && @dest_parts > 1 &&
           $source_dirs[0] eq $dest_parts[0]) {
        shift @source_dirs;
        shift @dest_parts;
    }

    # Relative it with some ../
    @source_dirs=grep { $_ ne "" } @source_dirs;
    my $result=("../" x scalar(@source_dirs)).join("/", @dest_parts);
    $result=~ s/^\///;
    return $result.$bookmark;
}

# Function: If you are viewing location "$base" which is a full URL, and
# click on "$new" that can be full or relative - where do you get? That
# is what this function returns.
#
# Input: base-URL, new-URL (where to go)
# Returns: a full format new-URL.  A bookmark ("#...") on new-URL is
#          kept, the bookmark of base-URL never is.  Returns undef when
#          new-URL is relative and base-URL has no "scheme://" part.
#
# Quotes around new-URL and surrounding white space are ignored.
#
sub merge_urls {
    my ($org, $new)=@_;
    my $url;

    $new="" if (!defined($new));
    $new=~ s/[\"\']//g;
    $new=~ s/^\s+//;
    $new=~ s/\s+$//;

    if ($new =~ m/^[A-Za-z][A-Za-z0-9+.\-]*:/) {
        # Already a full URL.  Only a scheme at the very start counts; a
        # colon further in ("view.cgi?time=12:30") does not.
        $url=$new;
    }
    elsif ($new eq "") {
        $url=$org;
    }
    elsif ($org =~ m/^(.*?):\/\/([^\/?#]*)(.*)$/s) {
        my ($prot, $server, $rest)=($1, $2, $3);

        # Split the rest of the base into directory, document and query.
        $rest=~ s/#.*$//s;
        my $query="";
        if ($rest =~ s/(\?.*)$//s) {
            $query=$1;
        }
        my ($path, $doc)=("", $rest);
        if ($rest =~ m/^(.*)\/(.*)$/s) {
            $path=$1;
            $doc=$2;
        }

        if ($new =~ m/^#/) {            # bookmark in the same document
            $url="$prot://$server$path/$doc$query$new";
        }
        elsif ($new =~ m/^\?/) {        # same document, other query
            $url="$prot://$server$path/$doc$new";
        }
        elsif ($new =~ m/^\/\//) {      # other server, same protocol
            $url="$prot:$new";
        }
        elsif ($new =~ m/^\//) {        # absolute path on the same server
            $url="$prot://$server$new";
        }
        else {                          # relative to the base directory
            # Only the path part is cleaned from "." and ".."; query and
            # bookmark of the link are appended untouched.
            my ($rel, $tail)=($new, "");
            if ($new =~ m/^([^?#]*)([?#].*)$/s) {
                ($rel, $tail)=($1, $2);
            }
            $url="$prot://$server"._remove_dot_segments("$path/$rel").$tail;
        }
    }
    return $url;
}

# Input: an absolute path (starting with "/")
#
# Function: Return the path with all "." and ".." segments resolved.
# A ".." can never climb above the root, so the server name in front of
# the path is never touched.
#
sub _remove_dot_segments {
    my ($path)=@_;
    my @segments=split(/\//, $path, -1);
    shift @segments;            # the empty field in front of the leading "/"

    my @kept;
    for my $index (0 .. $#segments) {
        my $segment=$segments[$index];
        if ($segment eq "." || $segment eq "..") {
            pop @kept if ($segment eq ".." && @kept);
            # A trailing "." or ".." names a directory: keep the final "/".
            push @kept, "" if ($index == $#segments);
        }
        else {
            push @kept, $segment;
        }
    }
    return "/".join("/", @kept);
}

# Package globals set by the main program and read by accept_url().
our (@basematch, @ignorepatt, $verbose);

# Input: a full URL
#
# Function: Decide if the URL shall be mirrored.
# Returns: 1 if the URL matches at least one of the patterns in
#          @basematch and none of the patterns in @ignorepatt,
#          otherwise 0.  With $verbose set the decision is traced on
#          STDERR.
#
sub accept_url {
    my ($url)=@_;

    print STDERR "test $url\n" if ($verbose);

    my $below_base=0;
    for my $pattern (@basematch) {
        next if (!defined($pattern) || $pattern eq "");
        if ($url =~ m/(?:$pattern)/) {
            $below_base=1;
            last;
        }
    }
    return 0 if (!$below_base); # No basematch

    for my $pattern (@ignorepatt) {
        # An empty pattern must be skipped: Perl would silently reuse the
        # last successful regex for it and so reject every URL.
        next if (!defined($pattern) || $pattern eq "");
        return 0 if ($url =~ m/(?:$pattern)/);
    }

    print STDERR "match $url\n" if ($verbose);
    return 1;
}