#!/usr/bin/perl
#
# getksh -- download the HTML pages of the O'Reilly book
# "Learning the Korn Shell" from a mirror site, using cURL.
#
# I built this script to prove to Strother that I'm actually learning
# something with this Perl stuff.
#
# cURL only gathers what the web server pushes to the client, so we cannot
# browse the server's directories. Because the file names are standardized
# and the hyperlinks are relative, we can simply cycle through the expected
# names and download everything we need. Relatively referenced images are
# not pulled in right now, so any images you see are explicitly referenced
# and come from the web server on which they actually reside, not from
# your machine.
#
# Run this script from the directory you want to drop the files in (in my
# case a directory called /ksh, buried a few folders deep). To execute it,
# "chmod a+x getksh" and then type ./getksh.
#
# The page ranges below are upper bounds found by manually browsing the
# site. curl is run without --fail, so for a page that does not exist it
# saves whatever error document the server sends back. That is acceptable
# because the relative links in the pagination of the real pages should
# never reference such a file.

use strict;
use warnings;

# Main URL directory; every downloaded file lives directly below it.
my $URL_ROOT = "http://docstore.mik.ua/orelly/unix2.1/ksh/";

# Naming conventions of the non-chapter files. The index pages live in the
# index/ subdirectory, both on the server and locally.
my $INDEX_PATH    = "index/idx_";
my $APPENDIX_PATH = "app";
my $PREFACE_PATH  = "prf";

# Build the curl command line that fetches one file below $URL_ROOT and
# stores it under the same relative name in the current directory.
#   $relative_path - path of the file relative to $URL_ROOT
# Returns the command as a single string terminated by a newline.
sub _curl_command {
    my ($relative_path) = @_;
    return "curl " . $URL_ROOT . $relative_path . " -o " . $relative_path . "\n";
}

# Run each command string in turn, warning about every one that fails.
# Dies if a command is killed by a signal, because system() ignores SIGINT
# in the parent and Ctrl-C would otherwise only skip the current download.
# Returns the number of commands that failed.
sub _run_commands {
    my @commands = @_;
    my $failures = 0;

    for my $command (@commands) {
        chomp(my $shell_line = $command);
        next if system($shell_line) == 0;

        if ($? == -1) {
            warn "could not run '$shell_line': $!\n";
        }
        elsif ($? & 127) {
            die sprintf("'%s' was killed by signal %d; aborting\n",
                $shell_line, $? & 127);
        }
        else {
            warn sprintf("'%s' exited with status %d\n", $shell_line, $? >> 8);
        }
        $failures++;
    }

    return $failures;
}

# Build the commands for the chapter pages, named chCC_PP.htm with both
# numbers zero-padded to two digits. No chapter is longer than 9 pages.
#   @_ - chapter numbers
# Returns a list of curl command strings, 9 per chapter.
sub chapter_cycle {
    my @chapters = @_;
    my @commands;

    for my $chapter (@chapters) {
        for my $page (1 .. 9) {
            push @commands,
                _curl_command(sprintf("ch%02d_%02d.htm", $chapter, $page));
        }
    }

    return @commands;
}

# Build the commands for the index pages, named index/idx_X.htm. This sub
# is smaller than the others because every index file is a single page.
#   @_ - index suffixes (0, then a..z)
# Returns a list of curl command strings, one per suffix.
sub index_cycle {
    my @suffixes = @_;
    return map { _curl_command($INDEX_PATH . $_ . ".htm") } @suffixes;
}

# Build the commands for the appendix pages, named appX_PP.htm. There are
# no more than 8 pages in any appendix of this book.
#   @_ - appendix letters (a..c)
# Returns a list of curl command strings, 8 per letter.
sub appendix_cycle {
    my @letters = @_;
    my @commands;

    for my $letter (@letters) {
        for my $page (1 .. 8) {
            push @commands,
                _curl_command(
                    sprintf("%s%s_%02d.htm", $APPENDIX_PATH, $letter, $page));
        }
    }

    return @commands;
}

# Build the commands for the preface pages, named prfN_PP.htm. The book
# has only one preface chapter, but the chapter number is taken from the
# argument so the sub also works if there are several.
#   @_ - preface chapter numbers (just 1 for this book)
# Returns a list of curl command strings, 8 per preface chapter.
sub preface_cycle {
    my @prefaces = @_;
    my @commands;

    for my $preface (@prefaces) {
        for my $page (1 .. 8) {
            push @commands,
                _curl_command(
                    sprintf("%s%s_%02d.htm", $PREFACE_PATH, $preface, $page));
        }
    }

    return @commands;
}

# Create the index/ subdirectory, then download the directory index, the
# ten chapters, the index pages, the appendices and the preface, in that
# order. Returns the exit status for the script: 0 if every command
# succeeded, 1 otherwise.
sub main {
    # The index pages are written into index/, so it must exist first.
    unless (-d 'index') {
        mkdir 'index' or die "cannot create directory 'index': $!\n";
    }

    my @commands = (
        _curl_command("index.htm"),
        chapter_cycle(1 .. 10),
        index_cycle(0, 'a' .. 'z'),
        appendix_cycle('a' .. 'c'),
        preface_cycle(1),
    );

    my $failures = _run_commands(@commands);
    warn "$failures of " . scalar(@commands) . " downloads failed\n"
        if $failures;

    return $failures ? 1 : 0;
}

# Only download when run as a script, so the subs can be loaded elsewhere
# without side effects.
exit main() unless caller;

1;

# So, what did we learn here? I hope we learned that Adam's not a complete
# idiot, and that, because of efficient use of code and subroutines, we
# could easily tweak this script to get many of the other books by
# adjusting $URL_ROOT. I also hope the comments taught the reader a little
# about cURL.
#
# Speaking of cURL: if you're running this on a Windows machine (through
# ActivePerl, for example) you'll need four additional files in the same
# directory: curl.exe, libcurl.dll, libeay32.dll, and libssl32.dll. That
# sounds complicated, but you can get them from http://curl.haxx.se for
# pretty much any platform. On most Linux distros, cURL will be available
# to you out of the box.