# Richard A. DeVenezia
# October 6, 2003
# http://www.devenezia.com
# Improve SAS Online help - tested with version 8 and 9
# Browse extractDir\index.html after running the script
# Runtime parameters -- edit these values before running the script
my %param = ( modules => "common af fsp" # space separated chm module names; common must be in the list
, extractDir => "c:\\temp\\sas" # folder receiving the decompiled modules; it must already exist
, noiseLevel => 9 # higher means more messages
, pageLimit => 0 # 0 = process all files; N>0 = only the first N files of each module (testing)
);
# This perl script was tested on a Windows 2000 machine
# perl for windows can be downloaded from http://www.activestate.com
# The source was edited using UltraEdit found at http://www.ultraedit.com
#
# I agree this script may contain stupid perl
#
# Runtime parameters
# -----
# modules - space separated list of SAS help chm modules,
# full list can be seen at !SASROOT\core\help
# extractDir - local path where chm help modules get decompiled
# noiseLevel - higher means more messages
# pageLimit - 0 means process all files, otherwise process only
# first N files of each module (only use N>0 when testing)
#
# module common is needed for style sheets
# the program adds an A{} block to make links more visible
#
#-------------------------------------
# What does the script do ?
#-------------------------------------
#
# Modify html files extracted from SAS chm files:
# - color links that refer to pages that refer back
# - link to __ALL__ pages that are referers
# - list dead links found on a page
# - indicate if the page is an orphan
# Generate an index that:
# - links to table of contents
# - links to list of keywords
# - lists orphan pages
# - lists pages with dead links
# - lists pages with duplicate titles
#
# Requires:
# Access to registry to determine SAS installation location
# Microsoft Html Help (hh.exe) so that .chm files can be decompiled.
# Following a link into an existing chm requires:
# Internet Explorer with JavaScript enabled
#
#-------------------------------------
# Background
#-------------------------------------
#
# The SAS Online documentation is quite complete and very informative.
# That does not mean it can not be improved. One area I find needing
# improvement is back links. Often a keyword search will take me to a
# page that does not have information to allow it to be 'located' in
# the contents tree when the location button is pressed. Nor is there
# a link to another page having 'parent' or 'aggregating' context.
#
# This is especially troublesome for AF programmers whose search places
# them at a methods or attributes page. These pages do not have back
# links to the class containing the method (ouch!).
#
# I would prefer each page to have a link to _every_ page that links to it.
# Doing so provides a much richer information net and lets me get a
# taste of the oosphere.
#
# So, I am addressing the situation
# A ---> B ( A is a referer of B )
# by altering B so the relations are
# A <--> B ( force B to be a referer of A )
#
# An even better improvement (not being done by this program) would be to
# ensure the forced back link goes to the point in A where B is first referred to
#
# A more difficult yet equally useful navigation change would be to
# enable some form of horizontal traversal. (Some sections of SAS help do
# exhibit this feature.)
#
# Consider:
#
#        A        level 1        A
#       /|\                     /|\
#      / | \                   / | \
#     B  C  D     level 2     B--C--D
#
# I prefer all nodes on level 2 have links to every other node on level 2.
# At a minimum each node should provide a previous and next. In terms of
# SAS/AF, it would mean when you are looking at a method page, you are one
# or two clicks away from another class's method or attribute page.
# Anyway, that is for a later day...
#
# I studied the html files decompiled out of af and fsp chm and found
# several things:
#
# 1. very good consistency
# 2. consistency means simplistic pattern matching and replacement can
# be used to extract information and manipulate the html files to my
# own purposes.
#
#-------------------------------------
# What are the patterns ?
#-------------------------------------
#
# All link navigation is of form <A HREF="destination">information</A>.
# destination is of form MS-ITS:<module>.chm::/<module>.hlp/<page>.
# The decompiled html files are placed in a <module>.hlp subfolder.
# Image SRC refer to a root absolute /<module>.hlp/images/ instead of
# relative ../<module>.hlp/images
#
# With such good consistency we can
#
# 1. make changes to HREFs so online help works in decompiled form
# 1a. some advanced mojo is used to change links to modules _not_ decompiled.
# the links are changed to cause htmlhelp to open when the link is clicked.
# the mojo only works in Internet Explorer browser.
# 2. determine incoming and outgoing links of each page for processing
# I.E.
# - if a page P has links incoming from A,B,C,X,Y and has outgoing links to B,C,X,Y,Z
# I want to add to page P outgoing links to A and colorize the outgoing links
# B,C,X,Y
#
#-------------------------------------
# How is the link data processed ?
#-------------------------------------
#
# Regular Expressions and Hashes!!!
#
# Each file in the .hlp folders is scanned and information extracted.
# At the same time, 'fixes' are made to the links so they work in decompiled form.
#
# There will be three conceptual hashes maintained
# - pages - hash for page data, each file scanned has 'page data'
# o page data - an array
# - incoming, hash for page referers
# - outgoing, hash for page href destinations
# - title of page
#
# The data requires two passes
# pass 1. fix necessary links and record linkages
# pass 2. analyze linkages and update pages if necessary
#
# Once the data is in hashes, it is a relatively simple matter to
# perform all the interesting set analysis we want.
#
#-------------------------------------
# How big is this stuff ?
#-------------------------------------
# common, af and fsp ends up with ~7,500 files (36mb)
# and requires about 100 seconds to process when run on a
# Windows 2000 / Intel 3.06gHz / 1g ram / ata-100 system
#
# I have not tried recompiling the modified html back into
# chm files.
use strict;
use Time::HiRes qw(gettimeofday);
use Win32::TieRegistry;
use DirHandle;
#----------------------------------------------------------------------
#---- unpack the runtime parameters into individual lexicals
my $modules    = $param{modules};
my $extractDir = $param{extractDir};
my $noiseLevel = $param{noiseLevel};
my $pageLimit  = $param{pageLimit};
# the two numeric settings are optional and fall back to zero
$noiseLevel = 0 unless defined $noiseLevel;
$pageLimit  = 0 unless defined $pageLimit;
# refuse to run when the module list does not name the common module
die "Call me stubborn, no running without common module."
    unless $modules =~ /\bcommon\b/;
#---- probe for the HtmlHelp decompiler before doing any real work:
#---- launch hh.exe once and give up unless system() reports a zero status
#---- (foo and bar look like placeholder arguments -- the call is only a probe)
my $rc = system ( "hh.exe -decompile foo bar" );
die "Problem running HtmlHelp decompiler.\n" unless $rc == 0;
#---- look up the SAS installation folder in the registry
my $saskey = "HKEY_LOCAL_MACHINE\\SOFTWARE\\SAS Institute Inc.\\The SAS System\\";
# step 1: the current version, stored directly below the SAS key
my $keyname = $saskey . "CurrentVersion\\CurrentVersion";
my $curver = $Registry->{$keyname}
    or die "Problem retrieving SAS current version from registry.\n";
# step 2: the DefaultRoot entry kept under that version
$keyname = $saskey . $curver . "\\DefaultRoot";
my $sasRoot = $Registry->{$keyname}
    or die "Problem retrieving SAS root from registry.\n";
# both values are in hand, let go of the registry object
undef $Registry;
#---- is the extract folder available ?
die "Folder ${extractDir} does not exist, and I won't make it for you."
    unless -d $extractDir;
# the chm help modules live under core\help of the SAS root
my $chmRoot = $sasRoot . "\\core\\help";
# actualFolder hands both names back with a trailing backslash
my $helpFolder = actualFolder ( $chmRoot );
my $htmlFolder = actualFolder ( $extractDir );
#---- derive the module list and the folder spellings used further on
# The single-space string pattern makes split break on any run of whitespace
# and skip leading whitespace, so a doubled or stray space in the modules
# setting cannot produce an empty module name (which would also leave an
# empty, match-anything branch in the alternation built on the next line).
my @modules = split ' ', $modules;
# from here on $modules holds the names joined for use as an rx alternation
$modules = join ("|", @modules); # rx alternation
my $folder = $htmlFolder;
my $rxfolder = $folder;
$rxfolder =~ s/\\/\\\\/g; # folder for use in rx patterns (backslashes doubled)
my $helpFldr = $helpFolder;
$helpFldr =~ s#\\#/#g; # helpFolder for use in javascript:showHelp (forward slashes)
#---- counters shown in the closing summary
my ($readCount, $writeCount, $rereadCount, $tweakCount, $detweakCount) = (0) x 5;
#---- lookup tables
my %pages; # one array of page data per html file
my %titles; # one array of pages for each page title
#---- slot numbers inside a page data array
my ($_INCOMING, $_OUTGOING, $_TITLE, $_ISTWEAKED, $_SHOWHELP) = (0 .. 4);
#---- array for recording timing splits
my @timeMark;
tmark(1); # start stopwatch 1; tsplit(1,...) below reads it back
#---- the processing steps, run in this order
decompile();
translate_hhc();
translate_hhk();
fix_hyperlinks();
hash_titles();
report_incoming();
create_root_index();
report_files_read();
create_link_colorizer();
tweak_common_css();
install_back_links();
#------- The End
tsplit(1,'magic');
#---- closing summary: the module list, then one line per counter
my $summary = "\n\nmodules: " . join(" ", @modules);
foreach my $entry ( [ "adjust read",   $readCount    ]
                  , [ "adjust write",  $writeCount   ]
                  , [ "improve read",  $rereadCount  ]
                  , [ "improve write", $tweakCount   ]
                  , [ "detweak write", $detweakCount ]
                  ) {
    my ($label, $count) = @$entry;
    $summary .= "\n" . $label . ": " . $count;
}
print $summary;
exit 0;
#----------------------------------------------------------------------
#----------------------------------------------------------------------
#----------------------------------------------------------------------
sub tmark {
    # Start (or restart) stopwatch number $n by stamping it with the current time.
    my $n = shift;
    $timeMark[$n] = gettimeofday;
}
#----------------------------------------------------------------------
sub tsplit {
    # Take a split on stopwatch number $n.
    #   $n     - stopwatch number, as given to tmark
    #   $label - optional text for the progress message
    # Returns the time elapsed since the stopwatch was last stamped.
    my ($n, $label) = @_;
    my $now     = gettimeofday;
    my $elapsed = $now - $timeMark[$n];
    $timeMark[$n] = $now;    # taking a split also restarts the stopwatch
    # stay silent without a label, or when $n is above the noise level
    return $elapsed unless defined($label) && $n <= $noiseLevel;
    # the message is padded with 2*$n-1 blanks
    print "\n[" . " " x (2*$n-1) . "$label: ${elapsed}s ]";
    return $elapsed;
}
#----------------------------------------------------------------------
sub actualFolder {
    # Hand back the given folder name with one backslash appended.
    my $folder = shift;
    return $folder . "\\";
}
#----------------------------------------------------------------------
sub readFile {
    # Slurp the whole of a file into a caller supplied scalar.
    #   $file        - path of the file to read
    #   $content_ref - reference to the scalar that receives the content
    # Returns the content that was read. Dies when the file cannot be opened.
    my ($file, $content_ref) = @_;
    local $/;    # undefined record separator: a single read returns the entire file
    # three-argument open, so the file name is never interpreted as a mode or a pipe
    open (my $input, '<', $file) or die "can't open $file: $!";
    $$content_ref = <$input>;
    close ($input);
    return $$content_ref;
}
#----------------------------------------------------------------------
sub writeFile {
    # Replace the content of a file with a caller supplied scalar.
    #   $file        - path of the file to (over)write
    #   $content_ref - reference to the scalar holding the new content
    # Returns 1. Dies when the file cannot be opened, written or closed.
    my ($file, $content_ref) = @_;
    # three-argument open, so a file name that starts with > or ends in | is taken literally
    open (my $output, '>', $file) or die "can't open $file: $!";
    print {$output} $$content_ref or die "can't write $file: $!";
    # buffered write errors only surface at close time, so check that as well
    close ($output) or die "can't close $file: $!";
    return 1;
}
#----------------------------------------------------------------------
sub addToHtml {
my ($file, $snippet) = @_;
my ($gulp, $burp);
readFile ( $file, \$gulp );
$rereadCount++;
print "\n"," "x10,"read $file(".length($gulp).")" if $noiseLevel>=99;
$burp = $gulp;
$burp =~ s{\n?.*?\n?}{}gos;
$burp =~ s{(