#!/usr/bin/perl

# I built this script to prove to Strother that I'm actually learning something
# with this Perl stuff. The goal of this file is to go out and download all of the
# HTML files associated with the O'Reilly book "Learning The Korn Shell" from a particular site.

# We'll be using cURL, which gathers things after the webserver has
# pushed them to the client. So, we can't exactly poke through all of their
# directories, but since the file names are standardized, and the hyperlinks
# are relative, we can just cycle through and download everything we need.
# Relatively referenced images are not pulled in right now, so if you see
# any, they're explicitly referenced and pulling from the webserver
# on which they actually reside, not your machine.

# You should run this file in the directory you want to drop the files in.
# In my case, it's buried a few folders deep in a directory called /ksh.
# To execute it, you'll want to "chmod a+x getksh" so you can run it by
# simply typing ./getksh. Here we go...

# Make the subdirectory for the index files that we'll download later in the
# script. The built-in mkdir avoids spawning a shell, and the -d test keeps a
# re-run from complaining just because the directory is already there.
if (!-d 'index') {
    mkdir 'index' or warn "Could not create directory 'index': $!\n";
}

# Save some time by putting the main url directory into a variable for later use.
my $url_root = "http://docstore.mik.ua/orelly/unix2.1/ksh/";

# Download the directory index file into the current directory.
# The list form of system() passes the arguments straight to curl without
# going through a shell, and returns 0 when curl succeeds.
system('curl', $url_root . 'index.htm', '-o', 'index.htm') == 0
    or warn "curl failed for index.htm (\$? = $?)\n";

# Now we get the chapter files. The webserver names them chNN_PP.htm, where
# NN is the chapter and PP is the page, both zero-padded to two digits.
#
# There are 10 chapters. A quick manual browse of the files shows that no
# chapter is longer than 9 pages, so the page loop is capped there. For
# chapters with fewer pages, cURL simply saves whatever 404 page the server
# sends back, which is fine because the relative links in the pagination of
# the HTML should never reference the irrelevant file.
for my $ch (1 .. 10) {
    for my $pg (1 .. 9) {

        # sprintf's %02d does the zero-padding for both numbers in one step,
        # so chapter 10 and chapters 1-9 need no special handling.
        my $file_name = sprintf "ch%02d_%02d.htm", $ch, $pg;

        # Fetch the page and write it to the current directory of our local
        # machine under the same relative file name, without the url root.
        # The list form of system() runs curl directly (no shell) and
        # returns 0 on success, so a failed transfer gets reported.
        system("curl", $url_root . $file_name, "-o", $file_name) == 0
            or warn "curl failed for $file_name (\$? = $?)\n";
    }
}

# Create some variables for the naming convention of the non-chapter
# files; specifically, the preface, the index (which will live in the
# subdirectory that we created way up at the top of this script), and the
# appendix. They are declared with "our" so they stay package variables,
# which is how the subs further down read them.
our $preface_path  = "prf1";
our $index_path    = "index/idx_";
our $appendix_path = "app";

# The index naming convention is "index/idx_X" where X = 0, then a..z,
# so 0 goes in first and the alphabet follows it.
# The range endpoints are quoted on purpose: a bare a..z only compiles while
# "use strict" is off, and a bare q, s, m or y would be read as a quote-like
# operator instead of a letter. The quoted form 'a' .. 'z' works with or
# without strict.
our @index_string = (0, 'a' .. 'z');

# The appendix naming convention is appX where X = a..c.
our @appendix_string = ('a' .. 'c');

# There is only 1 preface chapter, so this list holds a single value. It is
# kept as an array rather than a scalar so that it can be handed to a sub in
# the same way as the index and appendix lists.
our @preface_string = (1);

# I could have cycled through the data for the index, appendix, and preface
# the same way I did for the chapters, but I wanted to prove I knew how to use
# subs as well. It also makes for a bit less code overall.

# Build the curl command line for every index page.
#
# This sub is a bit smaller than the appendix and preface ones because the
# index files are all 1 page each: one command per suffix, no page loop.
#
# Arguments: the list of index suffixes (the X in "index/idx_X.htm").
# Returns:   the list of command strings, each ending in a newline. The same
#            list is also left in the package array @index_return.
# Reads $url_root and $index_path, which are set earlier in the script.
sub index_cycle {
    my @suffixes = @_;

    # Kept as a package variable because code outside this sub may read it
    # after the call. It is emptied first so that a second call does not hand
    # back the commands of the first call all over again.
    our @index_return = ();

    for my $suffix (@suffixes) {
        my $page = $index_path . $suffix . ".htm";
        push @index_return, "curl " . $url_root . $page . " -o " . $page . "\n";
    }

    return @index_return;
}

# Call the routine above, sending the index array. It hands back one curl
# command line per index page.
my @index_files = index_cycle(@index_string);

# Run the returned commands one at a time, so that a failed download is
# reported individually instead of being lost inside one big shell call.
for my $command (@index_files) {
    chomp(my $cmd = $command);    # drop the trailing newline, if any
    system($cmd) == 0
        or warn "Command failed (\$? = $?): $cmd\n";
}

# Build the curl command line for every page of every appendix chapter.
#
# Unlike the index, each appendix chapter has several pages, so there is an
# inner loop over the page numbers. There are no more than 8 pages in any
# given appendix chapter for this directory, so the loop is capped at 8.
#
# Arguments: the list of appendix letters (the X in "appX_PP.htm").
# Returns:   the list of command strings, each ending in a newline, ordered
#            by chapter and then by page. The same list is also left in the
#            package array @appendix_return.
# Reads $url_root and $appendix_path, which are set earlier in the script.
sub appendix_cycle {
    my @chapters = @_;

    # Kept as a package variable because code outside this sub may read it
    # after the call. It is emptied first so that a second call does not hand
    # back the commands of the first call all over again.
    our @appendix_return = ();

    for my $chapter (@chapters) {
        for my $page (1 .. 8) {

            # %02d zero-pads the page number to two digits.
            my $file_name = sprintf "%s%s_%02d.htm", $appendix_path, $chapter, $page;
            push @appendix_return,
                "curl " . $url_root . $file_name . " -o " . $file_name . "\n";
        }
    }

    return @appendix_return;
}

# Call the appendix routine. It hands back one curl command line per
# appendix page.
my @appendix_files = appendix_cycle(@appendix_string);

# Run the returned commands one at a time, so that a failed download is
# reported individually instead of being lost inside one big shell call.
for my $command (@appendix_files) {
    chomp(my $cmd = $command);    # drop the trailing newline, if any
    system($cmd) == 0
        or warn "Command failed (\$? = $?): $cmd\n";
}

# Build the curl command line for every page of the preface.
#
# This is basically the same deal as the appendix cycle, with pages 1 to 8.
# One difference: the elements of the argument list are not used in the file
# names. The file-name stem comes entirely from $preface_path, so each
# element passed in just produces one more pass over the same 8 pages.
#
# Arguments: a list with one element per pass.
# Returns:   the list of command strings, each ending in a newline. The same
#            list is also left in the package array @preface_return.
# Reads $url_root and $preface_path, which are set earlier in the script.
sub preface_cycle {
    my @chapters = @_;

    # Kept as a package variable because code outside this sub may read it
    # after the call. It is emptied first so that a second call does not hand
    # back the commands of the first call all over again.
    our @preface_return = ();

    for my $chapter (@chapters) {
        for my $page (1 .. 8) {

            # %02d zero-pads the page number to two digits.
            my $file_name = sprintf "%s_%02d.htm", $preface_path, $page;
            push @preface_return,
                "curl " . $url_root . $file_name . " -o " . $file_name . "\n";
        }
    }

    return @preface_return;
}

# Call the preface routine. It hands back one curl command line per
# preface page.
my @preface_files = preface_cycle(@preface_string);

# Run the returned commands one at a time, so that a failed download is
# reported individually instead of being lost inside one big shell call.
for my $command (@preface_files) {
    chomp(my $cmd = $command);    # drop the trailing newline, if any
    system($cmd) == 0
        or warn "Command failed (\$? = $?): $cmd\n";
}

# So, what did we learn here? I hope we learned that Adam's not a complete idiot,
# and that, because of efficient use of code and sub-routines, we could easily tweak
# this script to get many of the other books by adjusting the $url_root. Also, I hope
# that we even taught the reader of these comments about the cURL library.

# Speaking of the cURL library, if you're running this on a Windows machine, through
# ActivePerl, for example, you'll need 4 additional files in the same directory. Those are
# curl.exe, libcurl.dll, libeay32.dll, and libssl32.dll. That sounds complicated, but you can
# get them from http://curl.haxx.se for pretty much any platform. On most Linux distros,
# cURL will be available to you out-of-the-box.