#! /usr/bin/perl
#
# Program: log_profile.pl
#
$version = "6.0 02Dec1999";
# By: Kenneth J. Lanfear, U.S. Geological Survey (lanfear@usgs.gov)
#
# Versions 2-4 incorporated significant contributions by
# Graig McHendrie and suggestions by Henry A. Flogel.
# Version 5.0 is a radical restructuring of the program to allow scaling
# for larger access_log files.
# Version 5.1 adds the capability to analyze the activity of robots.
# Version 5.2 shows page hits by visits and users.
# Version 5.3 has a new and faster log parser. Code has been tuned for speed.
# Also, a bug was repaired in Version 5.2's page count.
# Version 5.4 is tolerant of logs that don't have the offset specified.
# Version 5.4.1 makes minor changes so that the coding is compatible
# with Perl5 in a Windows environment.
# Version 5.4.1.1 adds Google to the default referrers.
# Version 6.0 completely revises the time computations, with time based
# on the earliest time, GMT. It also saves user history as a hash of arrays
# of arrays -- a more complex structure, but faster to process.
# Version 6.0 is much more careful about designating local
# variables in subroutines. Global variables intended to be exchanged
# among routines now begin with capital letters to distinguish
# them from local variables.
#
# INSTALLATION
# The log_profile.txt file must be installed to show usage.
# See installation variables in the section below.
#
# =======================================================================
# DISCLAIMER
#
# Although this program has been used by the USGS, no warranty, expressed
# or implied, is made by the USGS or the United States Government as to the
# accuracy and functioning of the program and related program material, nor
# shall the fact of distribution constitute any such warranty, and no
# responsibility is assumed by the USGS in connection therewith.
#
# =======================================================================
# Invoke the library.
use Time::Local;

#Installation-dependent variables.
$Maxusers = 20000;  #This is the maximum number of users that will be
    # processed at once. After that, the program saves them on temporary
    # files. This number should be as large as possible for the best speed.
    # However, if you run out of memory, make it smaller.
$Templog_Base = "log_profile.tmp.$$.";  #This is the name template for
    # temporary files ($$ is the process id).
$UsageFilePath = '/www/log_profile/program/log_profile.txt';
    #Pathname for the usage file.

#Default run controls. Some can be changed in the "rc" file.
# If you change these here, you may want to change the information
# in the log_profile.txt file.
$Rpt = 20000;          #As it reads the log, the program reports at this interval.
$Def_Topcount = 40;    #Reports on the top $Def_Topcount pages.
$VisitLength = 1800;   #30-minute visit
$MinHits = 1000;       #Minimum number of hits for considering a page.
$RobotLimit = 19;      #Robots must hit more than this many pages to be detected.
$BGColor = '#ffffff';
$LastOffset = '-0400'; #Default offset from GMT.

#Set defaults, part 1
# All four signals are routed to the sub named 'interrupt'.
$SIG{'INT'} = 'interrupt';
$SIG{'QUIT'} = 'interrupt';
$SIG{'TERM'} = 'interrupt';
$SIG{'HUP'} = 'interrupt';

#Names for printing user categories
$ULong{'all'} = 'All Users ';
$ULong{'inhouse'} = 'In-house ';
$ULong{'outside'} = 'Outside ';
$ULong{'gov'} = ' Government ';
$ULong{'edu'} = ' Education ';
$ULong{'com'} = ' Commercial ';
$ULong{'majornet'} = ' Major Networks ';
$ULong{'foreign'} = ' Foreign ';
$ULong{'other'} = ' Other ';

#Classes of pages
$PageTypeClass{'html'} = 'Hypertext Markup Language';
$PageTypeClass{'shtml'} = 'Server-side Includes';
$PageTypeClass{'css'} = 'Cascading Style Sheets';
$PageTypeClass{'ps'} = 'PostScript';
$PageTypeClass{'pdf'} = 'Portable Document Format';
$PageTypeClass{'image'} = 'Images';
$PageTypeClass{'map'} = 'Image Maps';
$PageTypeClass{'cgi-bin'} = 'Scripts';
$PageTypeClass{'arcexp'} = 'Arc/INFO Export Files';
$PageTypeClass{'sdts'} = 'Spatial Data Transfer Standard (SDTS)';
$PageTypeClass{'other'} = 'Other';

#Order in which the page classes are listed.
@PageTypeClassOrder =
    ('html', 'shtml','css','ps','pdf','image','map',
     'cgi-bin','arcexp','sdts','other');

#Lower-case month abbreviation to month number (1-12).
%AlphaMonthToDeciMonth = (
    jan => 1, feb => 2, mar => 3, apr => 4, may => 5, jun => 6,
    jul => 7, aug => 8, sep => 9, oct => 10, nov => 11, dec => 12
);

unless ($ARGV[0]) {&usage_die}  #Check for usage
&readoptions;      # Process command line options
&proc_rc;          # Process the runtime file
&readcommandline;  # Read the rest of the command line

#Set remaining defaults
# Each count falls back to $Def_Topcount when it is still unset (or zero).
$Tc_tp     ||= $Def_Topcount;
$Tc_ip     ||= $Def_Topcount;
$Tc_fr     ||= $Def_Topcount;
$Tc_rf     ||= $Def_Topcount;
$Tc_agents ||= $Def_Topcount;
$Userfile_Page ||= $N_Page_Alias;
if ($Default_Domain =~ /^usgs\.gov$/i) {$USGSonly_flag = 1}

#Open some files
# Each optional output file is opened for writing; a failed open stops the
# run, as it does for the main report file, instead of silently losing
# the output.
if ($UserFile) {
    open (USERLIST,">$UserFile") || die "Can not open user file $UserFile.\n";
    unless ($opt_s) {print "User file will be $UserFile\n"}
}
if ($PageFile) {
    open (PAGELIST,">$PageFile") || die "Can not open page file $PageFile.\n";
    unless ($opt_s) {print "Page file will be $PageFile\n"}
}
if ($RobotFile) {
    open (ROBOTLOG,">$RobotFile") || die "Can not open robot file $RobotFile.\n";
    unless ($opt_s) {print "Robot file will be $RobotFile\n"}
}
if ($ReferFile) {
    # The refer file is only announced here; it is not opened in this section.
    unless ($opt_s) {print "Refer file will be $ReferFile\n"}
}

#Compute starting and ending times
unless (&beginendtime) {
    print "Starting and ending dates are wrong.\n";
    print "$DateBegin, $Time_Begin\n";
    print "$DateEnd, $Time_End\n";
    &usage_die;
}

#Print a welcome message
unless ($opt_s) {
    print "Program log_profile.pl, version $version is analyzing your log files.\n";
    print "Obtaining run-time control information from $rc_file\n";
    print "Analysis is for $Server and time period of $DateBegin to $DateEnd.\n";
    print "A progress report will be given every ",$Rpt," entries.\n";
#    print "$rc_rules";
}

#Read and process each line of log files.
# Main pass over the input logs: every path in @input_log is opened with
# openlog, its records are parsed and stored per user, and the log is
# closed again with closelog.
foreach $path (@input_log) {
    # With $Multilog set, keep the part of the path after the last slash
    # in $LogFileName.
    if ($Multilog && $path =~ /([^\/]+)$/) {$LogFileName = $1}
    &openlog($path);
    # NOTE(review): the loop condition below is empty, which Perl treats as
    # always true, and nothing visible here assigns $_. Presumably this was
    # a line read from the handle opened by openlog -- confirm against a
    # pristine copy of the program. As written, the only exit from the loop
    # is the $CutLog test.
    while () {
        chomp;
        $record = $_;
        ++$Count_Logentries;
        ++$BlockCount;
#        last if ($Count_Logentries > 100); #For testing
        last if ($CutLog); #Cuts off reading when past end time
        if ($BlockCount >= $Rpt) {&show_progress} #Status report
        ($user,$time,$page,$refer,$agent,$bytes) = &parserecord($record);
        next unless ($user); #User is blank if record fails to parse.
        $Count_Success += 1;
        # Replace the page, agent and referrer strings by the numbers
        # returned from the assign routines before storing the hit.
        $page_number = &pageassign ($page,$bytes);
        $agent_number = &agentassign ($agent);
        $refer_number = &referassign ($refer);
        #We want to construct hit records by user. However, if we
        # exceed usermax users, we want to save the records
        # for "excess" users in a file to be read later.
        unless (&add_logrec($user,$time,$page_number,$agent_number,$refer_number)) {
            &save_logrec($user,$time,$page_number,$agent_number,$refer_number);
        }
    }
    &closelog($path);
}
&clear_pointers; #Clear unneeded pointers

#Tabulate user visits
print "Tabulating user visits.\n";
&run_userqueue; #Drain the initial record of user hits.
while (&read_temp_log) {# Repeat until all users are processed.
    &run_userqueue;
}

#Compute the statistics.
&stats_summarize;
&stats_totals;

#Write the report.
unless ($opt_s) { print "Starting to write the report.\n"; print "Output will go to: $Path_Report\n"; } open (OUTFILE,"$Path_Report") || die "Can not open output file $Path_Report.\n"; unless ($Count_Success) {die "No records met the criteria.\n"} &show_banner; &show_summary_block; unless ($skip_users) { &show_user_pie; &show_user_days; &show_user_visits; &show_user_other; } unless ($skip_robots) { &show_robots; } unless ($skip_pages) { &show_pages_by_type; &show_page_access; &show_page_access_image; &show_first_access; } if ($PageFile) { &show_page_file; } unless ($Skip_Refers) { &show_referers; } unless ($Skip_Agents) { &show_agents; } unless ($skip_days) { &show_days; } unless ($skip_rules) { &show_rules; } unless ($skip_defs) { &show_defs; } &show_footer; exit; #================================================================== sub add_logrec { #Adds a log entry to the user record. #We create a hash of arrays of arrays. my ($user,$time,$page_number,$agent_number,$refer_number) = (@_); unless (exists $UserRecord{$user}) { $Users_in_process += 1; if ($Users_in_process > $Maxusers) {return 0} } push (@{$UserRecord{$user}},[$time,$page_number,$agent_number,$refer_number]); return 1; } # ======================================================================== sub agentassign { #Records a hit on a agent and returns the agent number. 
#Agents start at 1
  my ($agent) = $_[0];
  if ($Skip_Agents) {return 0}
  unless (exists $AgentNo{$agent}) {
    $TotAgent += 1;
    $AgentNo{$agent} = $TotAgent;
    $AgentName[$TotAgent] = $agent;
  }
  return $AgentNo{$agent};
}

# ========================================================================
sub aliaspage { #Returns the page's alias
  #The first alias test (1..$N_Page_Alias) that matches wins;
  #  a page matching no test is returned unchanged.
  my ($page) = $_[0];
  foreach my $idx (1 .. $N_Page_Alias) {
    my $test = $Page_Alias_Test[$idx];
    return $Page_Alias[$idx] if ($page =~ /$test/);
  }
  return $page;
}

# ========================================================================
sub aliasrefer { #Returns the referrer's alias
  my ($refer) = $_[0];

  #Hard-wired aliases, matched against the host part after "//".
  #  Dots in the domain names are escaped so they match literally.
  return 'USGS'      if ($refer =~ m{//[^/]*\.usgs\.gov});
  return 'EPA'       if ($refer =~ m{//[^/]*\.epa\.gov/});
  return 'NOAA'      if ($refer =~ m{//[^/]*\.noaa\.gov/});
  return 'Yahoo'     if ($refer =~ m{//[^/]*yahoo});
  return 'InfoSeek'  if ($refer =~ m{//[^/]*infoseek});
  return 'AltaVista' if ($refer =~ m{//[^/]*altavista});
  return 'Lycos'     if ($refer =~ m{//[^/]*lycos});
  return 'Excite'    if ($refer =~ m{//[^/]*excite});
  return 'Google'    if ($refer =~ m{//[^/]*google});
  return '50states'  if ($refer =~ m{//[^/]*www\.50states\.com});

  #Aliases from the run-time control file (case-insensitive tests).
  return $refer unless ($N_Refer_Alias);
  foreach my $idx (1 .. $N_Refer_Alias) {
    my $test = $Refer_Alias_Test[$idx];
    return $Refer_Alias[$idx] if ($refer =~ /$test/i);
  }
  return $refer;
}

# ========================================================================
sub check_code { #Checks return code of log entry
  #If @Ok_Codes is non-empty, only those codes are accepted;
  #  otherwise any code from 200 through 399 is accepted.
  my ($result) = $_[0];
  if (scalar @Ok_Codes) {
    foreach my $code (@Ok_Codes) {
      return 1 if ($result == $code);
    }
    return 0;
  }
  return ($result >= 200 && $result < 400) ? 1 : 0;
}

# ========================================================================
sub checkpoint { #Prints a progress report at intervals
  ++$Checkpoint_Count;
  #Numeric comparison: both values are counts, not strings.
  if ($Checkpoint_Count == $Rpt) {
    print " Read $Count_Logentries log records and processed $Count_Success.\n";
    $Checkpoint_Count =
0;
  }
}

# ========================================================================
sub classpage { #Classifies a page into type
  #Takes a page number and returns a type name based on the page
  #  name's suffix.  The tests are ordered; the first match wins.
  my ($page_number) = $_[0];
  my ($page) = $PageName[$page_number];
  return 'html'    if ($page =~ /\.s?html?$|\/$/i);                     #Hypertext Markup Language
  return 'image'   if ($page =~ /\.gif$|\.jpeg$|\.xbm$|\.jpg$|\.tif$/i); #Images
  return 'cgi-bin' if ($page =~ /cgi-bin|\?/);                          #CGI-BIN scripts
  return 'css'     if ($page =~ /\.css$/i);                             #Cascading style sheet
  return 'pdf'     if ($page =~ /\.pdf$/i);                             #Page Description Format
  return 'map'     if ($page =~ /\.map$|\.imf$/i);                      #Image Maps
  return 'ps'      if ($page =~ /\.ps$|\.eps$|\.ps\.gz$|\.eps\.gz$/i);   #PostScript
  return 'arcexp'  if ($page =~ /\.e00$|\.e00\.gz$/i);                  #Arc/INFO export files
  return 'sdts'    if ($page =~ /\.sdts$|\.sdts\.tgz$/i);               #Spatial Data Transfer Standard
  return 'other';
}

# ========================================================================
sub clear_pointers { #Clears unneeded pointers
  #The name-to-number lookups are only needed while reading logs.
  undef (%ReferNo);
  undef (%AgentNo);
  undef (%PageNo);
  return;
}

# ========================================================================
sub closelog { #Closes a log file
  #If openlog unzipped the file ($WasZipped), zip it again.
  my ($path) = $_[0];
  unless ($opt_s) {print "Closing $path\n"}
  close (LOGFILE) || die "Could not close $path\n";
  if ($WasZipped) {
    unless ($opt_s) {print "Re-zipping $path\n"}
    #The caller passes the original name; after gunzip the file on
    #  disk no longer carries the .gz suffix, so strip it here.
    my ($plain) = $path;
    $plain =~ s/\.gz$//;
    #List form avoids handing the file name to a shell.
    if (system ('gzip',$plain) != 0) {
      print "WARNING - could not re-zip $plain\n";
    }
    $WasZipped = 0;
  }
}

# ========================================================================
sub iscontenttype { #Returns true if page type is not an image or map
  my ($pagetype) = $_[0];
  return 0 if ($pagetype eq 'image' || $pagetype eq 'map');
  return 1;
}

#==================================================================
sub isrobot { #Returns true if the history indicates a robot.
my ($user) = $_[0];
  #A user counts as a robot when it requested robots.txt and its
  #  history is long enough.  The length tested is that of the whole
  #  history; the original tested $#{$a}, the last index of a single
  #  four-field record, which is always 3.
  #  NOTE(review): confirm $RobotLimit is meant as a hit-count limit.
  my ($enough) = ($#{$UserRecord{$user}} >= $RobotLimit);
  foreach my $hit (@{$UserRecord{$user}}) {
    my ($time,$page_number,$agent_number,$refer_number) = @{$hit};
    if ($PageName[$page_number] =~ /robots\.txt/ && $enough) {return 1}
  }
  return 0;
}

#==================================================================
sub openlog { #Opens a log file
  #Opens $path on the LOGFILE handle, running gunzip first when the
  #  name ends in .gz, and clears the $CutLog flag.
  my ($path) = $_[0];
  if ($path =~ /\.gz$/) {
    unless ($opt_s) {print "Unzipping $path\n"}
    #List form avoids handing the file name to a shell.
    system ('gunzip',$path) == 0 || die "Could not unzip $path\n";
    $WasZipped = 1;
    $path =~ s/\.gz$//;
  }
  unless ($opt_s) {print "Opening $path\n"}
  open (LOGFILE,'<',$path) || die "Could not open $path\n";
  $CutLog = 0;
}

# ========================================================================
sub pageassign { #Assigns the page number.
  #Returns the number already assigned to $page, or assigns the next
  #  one (numbers start at 1) and records the page's name and size.
  my ($page,$bytes) = @_;
  my ($number) = $PageNo{$page};
  if ($number) {
    #Record the first nonzero byte value
    $PageSize[$number] = $bytes unless ($PageSize[$number]);
    return $number;
  }
  $TotPage += 1;
  $number = $TotPage;
  $PageNo{$page} = $number;
  $PageName[$number] = $page;
  $PageSize[$number] = $bytes;
  return $number;
}

# ========================================================================
sub parserecord { #Parses a record and returns "true" for accepted records.
  #On success returns ($user,$time,$page,$refer,$agent,$bytes), with
  #  $time in seconds after $Time_Begin.  Returns 0 for a record that
  #  does not parse or is filtered out.
  my ($record) = $_[0];
  my ($user,$time,$page,$refer,$agent,$bytes);
  my ($testrefer);
  my ($result);
  unless ($record =~ /(\S+) \- \- \[([^\]]+)\] \S+ (\S+) \S+ (\S+) (\S+) ?(\S*) ?(\S*)/) {return 0}
  $user = $1;
  $time = $2;
  $page = $3;
  $result = $4;
  $bytes = $5;
  $refer = $6;
  $agent = $7;

  #Process only records with successful transaction.
  unless ($result == 200 || &check_code($result)) {return 0}

  #Find the time
  $time = &ParseTime($time);
  unless ($time) {return 0}
  if ($time > $Time_End) {
    #Past the end time: the record is never accepted.  Unless option e
    #  is set, also flag the main loop to stop reading this log.
    $CutLog = 1 unless ($opt_e);
    return 0;
  }
  unless ($time >= $Time_Begin) {return 0}
  $time = $time - $Time_Begin;

  #Check the bytes.  Default is 0.
if ($bytes =~ /\D/) {$bytes = 0}
  unless ($user) {return 0};

  #Make sure domain is fully qualified, if possible.
  if (($user !~ /\./ || $user =~ /\.$/) && $Default_Domain) {
    $user = "$user.$Default_Domain";
    $user =~ s/\.\./\./;
  }
  if ($N_Domain_Rules && ! &passuser($user)) {return 0}

  #Get the page
  $page =~ s/\"//g;
  #If multiple logs, prepend the log file name
  if ($Multilog) {$page = $LogFileName . ':' . $page}
  unless ($PageNo{$page}) {
    #hits on index pages are seen the same way.
    $page =~ s/ +$//;
    if ($page =~ /\/$/) {$page .= 'index.html'}
    #Strip off the arguments from cgi-bin requests
    unless ($NoQuestion) {
      if ($page =~ /^([^\?]+\?)/) {$page = $1}
    }
    if ($N_Page_Alias) {$page = &aliaspage($page)}
    unless (&passpage($page)) {return 0}
  }

  #Get the agent and referrer.  An empty field becomes '-'.
  if ($agent) {
    if ($agent =~ /:\/\//) {
      #Some logs switch agent and referrer.
      $testrefer = $refer;
      $refer = $agent;
      $agent = $testrefer;
    }
    $agent =~ s/\"//g;
    #(A substitution here that replaced the leading token with itself
    #  has been dropped; it changed nothing.)
  }
  else {$agent = '-'}
  if ($refer) {
    $refer =~ s/\"//g;
    $refer = &aliasrefer($refer);
  }
  else {$refer = '-'}
  if ($N_Refer_Rules && ! &passrefer($refer)) {return 0}

  #Successful read.
  return ($user,$time,$page,$refer,$agent,$bytes);
}

# ========================================================================
sub passinhouse { #Returns true if this is an inhouse user.
  my ($user) = $_[0];
  if ($USGSonly_flag) {
    #Hard-wire standard USGS domains for speed.
    return 1 if ($user =~ /\.usgs\.gov$/);
    return 1 if ($user =~ /\.nbs\.gov$/);
    return 1 if ($user =~ /\.nwrc\.gov$/);
  }
  return 0 unless ($N_Inhouse_Rules);
  #The first inhouse rule that matches decides; the default is "no".
  foreach my $rule (1 .. $N_Inhouse_Rules) {
    my $test = $Inhouse_Test[$rule];
    next unless ($user =~ /$test/i);
    return $Inhouse_Action[$rule] ? 1 : 0;
  }
  return 0;
}

# ========================================================================
sub passpage { #Returns "true" if we want to process this page.
my ($page) = $_[0];
  my ($n,$test);
  #A page that already has a number has passed before.
  if ($PageNo{$page}) {return 1}
  #The first page rule that matches decides; the default is to include.
  for ($n=1; $n <= $n_page_rules; $n++) {
    $test = $page_test[$n];
    if ($page =~ /$test/) {
      if ($page_action[$n]) {
        return 1;
      }
      else {
        $n_pages_excluded++;
        return 0;
      }
    }
  }
  return 1;
}

# ========================================================================
sub passrefer { #Returns "true" if we want to process this referrer.
  my ($refer) = $_[0];
  #A referrer that already has a number has passed before.
  return 1 if ($ReferNo{$refer});
  #The first refer rule that matches decides; the default is to include.
  foreach my $rule (1 .. $N_Refer_Rules) {
    my $test = $Refer_Test[$rule];
    next unless ($refer =~ /$test/);
    return 1 if ($Refer_Action[$rule]);
    $N_Refer_Excluded++;
    return 0;
  }
  return 1;
}

# ========================================================================
sub passuser { #Returns true if we want to process this user.
  my ($user) = $_[0];
  #Bypass domain checking if user already has been checked.
  return 1 if ($UserRecord{$user});
  #The first domain rule that matches (case-insensitive) decides;
  #  the default is to include.
  foreach my $rule (1 .. $N_Domain_Rules) {
    my $test = $Domain_Test[$rule];
    next unless ($user =~ /$test/i);
    return 1 if ($Domain_Action[$rule]);
    $N_Domains_Excluded++;
    return 0;
  }
  return 1;
}

# ========================================================================
sub read_temp_log { #Reads from the temporary log
  #Returns 0 if no temporary log is open for writing.  Otherwise closes
  #  it, re-reads it into the user records (records that still do not
  #  fit are saved again by save_logrec), removes the file, and
  #  returns 1.
  my ($user,$time,$page_number,$agent_number,$refer_number);
  unless ($Templog_Write_Open) {return 0}
  close (TEMPLOG_WRITE);
  $Templog_Write_Open = 0;
  $Templog_Read = $Templog_Write;
  $Users_in_process = 0;
  open (TEMPLOG_READ,"$Templog_Read") || die "Could not open $Templog_Read\n";
  unless ($opt_s) {print "Reading temporary file $Templog_Read.\n"}
  while (<TEMPLOG_READ>) {
    chomp;
    ($user,$time,$page_number,$agent_number,$refer_number) = split(/\t/);
    unless (&add_logrec($user,$time,$page_number,$agent_number,$refer_number)) {
      &save_logrec($user,$time,$page_number,$agent_number,$refer_number);
    }
  }
  close (TEMPLOG_READ) || die "Could not close $Templog_Read.\n";
  #unlink works without a shell and on systems that have no "rm".
  unlink ($Templog_Read) || print "WARNING - could not remove $Templog_Read\n";
  return 1;
}

# ========================================================================
sub
referassign { #Records a hit on a refer and returns the refer number.
  #Returns 0 when referrers are skipped; otherwise numbers start at 1.
  my ($refer) = $_[0];
  return 0 if ($Skip_Refers);
  my ($number) = $ReferNo{$refer};
  unless ($number) {
    $TotRefer += 1;
    $number = $TotRefer;
    $ReferNo{$refer} = $number;
    $ReferName[$number] = $refer;
  }
  return $number;
}

#==================================================================
sub robotwrite { #Builds the text listing of one robot's hits.
  #The listing is left in the global $robotrecord, which robottally
  #  prints.  It is cleared first so that each robot's listing holds
  #  only its own hits.
  my ($user) = $_[0];
  my ($time,$page_number,$agent_number,$refer_number);
  my ($deltatime);
  $robotrecord = '';
  foreach my $hit (@{$UserRecord{$user}}) {
    ($time,$page_number,$agent_number,$refer_number) = @{$hit};
    #NOTE(review): $startvisittime is not assigned anywhere in the
    #  visible code, so this is the unadjusted time -- confirm.
    $deltatime = $time - $startvisittime;
    $robotrecord .= sprintf ("%s%8d%s%s%s",' ',$deltatime,' ',$PageName[$page_number],"\n");
  }
  return 1;
}

#==================================================================
sub robottally { #Adds one robot to the robot totals.
  #Also writes the robot's summary and hit listing to ROBOTLOG when a
  #  robot file was requested.
  my ($user,$agent_number,$uservisit,$userpage) = @_;
  $Robot_Count ++;
  $Robot_Visit += $uservisit;
  $Robot_Page += $userpage;
  return unless ($RobotFile);
  print ROBOTLOG "Robot: $user $AgentName[$agent_number]\n";
  print ROBOTLOG " Visits = $uservisit Pages = $userpage\n";
  print ROBOTLOG "$robotrecord\n\n";
  return;
}

# ========================================================================
sub run_userqueue { #Tabulates user visits, emptying the userrecord array.
  foreach my $user (keys (%UserRecord)) {
    &usertrack ($user);
  }
  undef (%UserRecord);
  return;
}

#==================================================================
sub save_logrec { #Saves a log record, in condensed form, for further processing
  my ($user,$time,$page_number,$agent_number,$refer_number) = (@_);
  unless ($Templog_Write_Open) {
    ++$Templog_Num;
    $Templog_Write = $Templog_Base .
$Templog_Num;
    open (TEMPLOG_WRITE,">$Templog_Write") || die "Can not open $Templog_Write for writing.";
    unless ($opt_s) {print "Opening temporary file $Templog_Write.\n"}
    $Templog_Write_Open = 1;
  }
  #One tab-separated record per line; read_temp_log splits on tabs.
  print TEMPLOG_WRITE "$user\t$time\t$page_number\t$agent_number\t$refer_number\n";
  return;
}

#==================================================================
sub show_progress { #Reports on progress
  #Prints the running counts and restarts the block counter.
  print "Read $Count_Logentries entries and processed $Count_Success.\n";
  $BlockCount = 0;
  return;
}

#==================================================================
sub split_name { #Split up a fully qualified host name
  #  host      = first component
  #  domain    = last component
  #  subdomain = anything in between
  #Returns ($host,$subdomain,$domain).  For a name with no letters
  #  (an IP number) the fourth octet is the host, the third is the
  #  subdomain, and the first two form the domain.
  my ($name) = $_[0];
  my ($host,$domain,$subdomain);
  my (@subuser) = split(/\./,$name);
  my ($last) = $#subuser;
  return ('','','') if ($last < 0);

  if ($name =~ /[A-Za-z]/) {
    # Host by name, not IP:
    $host = $subuser[0];
    return ($host,'','') if ($last == 0);
    $domain = $subuser[$last];
    if ($last != 1) {
      $subdomain = join ('.',@subuser[1 .. $last - 1]);
    }
  }
  else {
    # IP number:
    $host = $subuser[3];
    $subdomain = "$subuser[2]";
    $domain = "$subuser[0].$subuser[1]";
  }
  return ($host,$subdomain,$domain);
}

#==================================================================
sub stats_summarize { #Summarize for the "all" user type.
  my ($ut);
  foreach $ut ('inhouse','gov','edu','com','majornet','foreign','other') {
    #Add this user type's count into the 'all' entry of every tally.
    foreach my $tally (\%Tot_User,\%Tot_Visit,\%Tot_Day,\%Tot_Click,\%Tot_Page,
                       \%Uv_0,\%Uv_1,\%Uv_2,\%Uv_3_5,\%Uv_gt5,
                       \%Ud_1,\%Ud_2,\%Ud_3_5,\%Ud_gt5) {
      $tally->{'all'} += $tally->{$ut};
    }
  }

  #Summarize for the "outside" user type.
foreach $ut ('gov','edu','com','majornet','foreign','other') {
    #Add this user type's count into the 'outside' entry of every tally.
    foreach my $tally (\%Tot_User,\%Tot_Visit,\%Tot_Day,\%Tot_Click,\%Tot_Page,
                       \%Uv_0,\%Uv_1,\%Uv_2,\%Uv_3_5,\%Uv_gt5,
                       \%Ud_1,\%Ud_2,\%Ud_3_5,\%Ud_gt5) {
      $tally->{'outside'} += $tally->{$ut};
    }
  }
  return;
}

#==================================================================
sub stats_totals { #Get totals for pages and bytes served.
  #Sums hits over every index of @PageHits, including the last one
  #  (the original loop stopped one short).
  foreach my $i (0 .. $#PageHits) {
    $Total_Hits_Actual += $PageHits[$i];
    next unless ($PageHits[$i]);
    ++$Pages_Hit_Actual;
    $Total_Bytes_Actual += $PageSize[$i] * $PageHits[$i];
  }
  return;
}

# ========================================================================
sub usage_die {# Show usage and exit w/ error
  open (USAGE,"$UsageFilePath") || die "Sorry, no usage file.";
  while (<USAGE>) {print $_};
  close (USAGE);
  exit 1;
}

# ========================================================================
sub usertrack { #Analyzes the record of 1 user.
  #Splits the user's hits into visits (a gap longer than $VisitLength
  #  starts a new visit), updates the page, day, referrer and agent
  #  counts, and adds the user to the totals for its user type.
  my ($user) = ($_[0]);
  my ($ir) = 0;
  my (%order);
  my (@visits,@current,$lasttime);
  my ($i,$htime,$visit);
  my ($pagetype,$daypage);
  my ($time,$page_number,$agent_number,$refer_number);
  my ($is_robot);
  my (%markpagevisit);
  my (%markpageuser);
  my (%userinday);
  my ($markfirst,$goodvisit);
  my ($userpage,$uservisit,$userallvisit,$userclick,$userday);
  my (@userpagecount);
  my ($line,$usertype);

  #See if this user is a robot.
  $is_robot = &isrobot($user);
  if ($is_robot) {
    return if ($exclude_robots);
    &robotwrite($user);
  }
  elsif ($robots_only) {return}

  #Read the history.  Map each hit's time to its index in the record,
  #  adding a small sequential increment to distinguish same-second
  #  records.
  foreach my $r (@{$UserRecord{$user}}) {
    $order{${$r}[0] + .000001 * $ir} = $ir;
    $ir++;
  }

  #Group the hits, in time order, into visits.  $lasttime is local to
  #  this user, and the final visit is always kept, even when it
  #  consists only of the last hit.
  foreach $htime (sort {$a <=> $b} (keys %order)) {
    $userpage++;
    if (@current && ($htime - $lasttime) > $VisitLength) {
      push (@visits,[@current]);
      @current = ();
    }
    push (@current,$order{$htime});
    $lasttime = $htime;
  }
  push (@visits,[@current]) if (@current);

  #Analyze each visit.
  foreach $visit (@visits) {
    $userallvisit++;
    $goodvisit = 0;
    #Read through every page once.
    foreach $i (@{$visit}) {
      ($time,$page_number,$agent_number,$refer_number) = @{$UserRecord{$user}[$i]};
      $pagetype = &classpage($page_number);
      #Mark this if the visit has content
      if (! $goodvisit && &iscontenttype($pagetype)) {$goodvisit = 1}
      $PageHits[$page_number]++;
      $userpagecount[$page_number]++;
      unless (exists $markpageuser{$page_number}) {
        #Mark each page only once per user
        $markpageuser{$page_number} = 1;
        $PageUser[$page_number]++;
      }
      $PageTypeHits{$pagetype}++;
      $daypage = int($time / 86400);
      unless (exists $userinday{$daypage}) {
        $userday++;
        $userinday{$daypage} = 1;   #User had hits this day.
      }
      $DayHits[$daypage]++;         #Hits on each day.
    }
    next unless ($goodvisit);

    #Visit has non-image content.
    $uservisit++;
    undef %markpagevisit;
    $markfirst = 0;
    foreach $i (@{$visit}) {
      ($time,$page_number,$agent_number,$refer_number) = @{$UserRecord{$user}[$i]};
      unless (exists $markpagevisit{$page_number}) {
        #Mark each page only once per visit
        $markpagevisit{$page_number} = 1;
        $PageVisit[$page_number]++;
      }
      $pagetype = &classpage($page_number);
      next unless (&iscontenttype($pagetype));
      $userclick++;
      unless ($markfirst) {
        #Mark these for the first content page
        $pagefirst[$page_number]++;
        $ReferFirst[$refer_number]++;
        $AgentHit[$agent_number]++;
        $markfirst = 1;
      }
    }
  }

  #Tabulate for the user.
  $Count_Visit += $uservisit;
  $Count_AllVisit += $userallvisit;
  $usertype = &utype($user);
  $Tot_User{$usertype}++;                 #Total users by user class
  $Tot_Visit{$usertype} += $uservisit;    #Total visits by user class
  $Tot_Click{$usertype} += $userclick;    #Total clicks by user class
  $Tot_Page{$usertype} += $userpage;      #Total pages by user class

  #Classify by user visits.
  if ($uservisit == 1) {$Uv_1{$usertype}++}
  elsif ($uservisit == 2) {$Uv_2{$usertype}++}
  elsif ($uservisit > 2 && $uservisit < 6) {$Uv_3_5{$usertype}++}
  elsif ($uservisit == 0) {$Uv_0{$usertype}++}
  else {$Uv_gt5{$usertype}++}

  #Classify by user days
  $Tot_Day{$usertype} += $userday;        #Total days by user class
  if ($userday > 1 && $usertype ne 'inhouse') {$MultiDayUser++}  #Multi-day users
  if ($userday == 1) {$Ud_1{$usertype}++}
  elsif ($userday == 2) {$Ud_2{$usertype}++}
  elsif ($userday > 2 && $userday < 6) {$Ud_3_5{$usertype}++}
  else {$Ud_gt5{$usertype}++}

  if ($UserFile) {
    #Write the userfile, if necessary.
    $line = "$user\t$usertype\t$userday\t$uservisit\t$userclick\t$userpage";
    if ($Userfile_Page) {
      foreach $i (1..$Userfile_Page) {$line .= "\t$userpagecount[$i]"}
    }
    print USERLIST "$line\n";
  }
  #$agent_number here is that of the last hit read above.
  if ($RobotFile && $is_robot) {&robottally($user,$agent_number,$uservisit,$userpage)}
  return;
}

#==================================================================
sub utype { #Classifies the user by category
  #Returns one of 'inhouse', 'majornet', 'com', 'edu', 'gov',
  #  'foreign' or 'other'.
  my ($user) = $_[0];
  return 'inhouse' if (&passinhouse($user));

  #Classify outside domains.
  #Major online services (dot escaped, patterns anchored at the end).
  return 'majornet' if ($user =~ /\.prodigy\.com$|\.aol\.com$|\.compuserve?\.com$/i);
  #Network and organization count as commercial
  return 'com' if ($user =~ /\.com$|\.net$|\.org$/i);
  return 'edu' if ($user =~ /\.edu$/i);
  #Government, including military
  return 'gov' if ($user =~ /\.gov$|\.us$|\.mil$/i);
  #Foreign are letter codes not in the above.
  return 'foreign' if ($user =~ /\D$/);
  return 'other';
}

#==================================================================
#==================================================================
#  Subroutines that set up the variables and switches follow.
#==================================================================
# ========================================================================
sub beginendtime { #Beginning and ending time, in seconds
  my ($addsec);
  if ($DateBegin =~ /(\d?\d)(\D\D\D)(\d\d\d\d)(.*)/) {
    $DateBegin = $1 . '/' . $2 . '/' . $3 . ':00:00:00' . $4;
  }
  $Time_Begin = &ParseTime($DateBegin);
  if ($DateEnd =~ /(\d?\d)(\D\D\D)(\d\d\d\d)(.*)/) {
    $DateEnd = $1 . '/' . $2 . '/' . $3 . ':23:59:59' .
$4;
    $addsec = 1;
  }
  $Time_End = &ParseTime($DateEnd) + $addsec;
  unless ($Time_Begin && $Time_End && $Time_End > $Time_Begin) {return 0}
  return 1;
}

# ========================================================================
sub _rc_add_rules { #Stores one include/exclude rule list from the .rc file.
  #  $lines        - ref to the list lines ("include|exclude  pattern")
  #  $heading      - heading written to $rc_rules before the first rule
  #  $count_ref    - ref to the rule counter
  #  $tests        - ref to the array of rule patterns (indexed from 1)
  #  $actions      - ref to the array of rule actions (1 = include)
  #  $default_note - 'include' or 'exclude', for the closing note
  my ($lines,$heading,$count_ref,$tests,$actions,$default_note) = @_;
  foreach my $line (@{$lines}) {
    next unless ($line =~ /^(\S+)\s+(.+)$/);
    my ($action,$test) = ($1,$2);
    next unless ($action =~ /(^inc)|(^exc)/i);
    my $n = ++${$count_ref};
    if ($n == 1) {$rc_rules .= "\n$heading:\n"}
    $tests->[$n] = $test;
    #Same case-insensitive test as the check above.
    if ($action =~ /^inc/i) {$actions->[$n] = 1}
    my $action_text = $actions->[$n] ? 'Include' : 'Exclude';
    $rc_rules .= " $n $action_text $test\n";
  }
  if (${$count_ref}) {$rc_rules .= " Default is to $default_note.\n"}
  return;
}

# ========================================================================
sub proc_rc { # Process the .rc file
  #Reads the run-time control file named by option f, setting the
  #  corresponding globals and accumulating a description of the
  #  rules in $rc_rules.  Does nothing when option f is not given.
  my ($line,$matched);
  my (@list);

  return unless ($opt_f);
  open (RCFILE,'<',$opt_f) || die "Can not open run-time control file \"$opt_f\".\n";
  $rc_file = $opt_f;

  #Single-value settings: [pattern, variable to receive the value].
  #  Tried in order.  The userfile-page pattern must come before the
  #  user-file pattern, which would otherwise match it first.
  my @settings = (
    [qr/^report.*freq.*[ =]\s*(\d+)$/i,   \$Rpt],           # report frequency
    [qr/^visit.*length.*[ =]\s*(\d+)$/i,  \$VisitLength],   # visit length
    [qr/^temp.*file.*[ =]\s*(\S+)$/i,     \$Templog_Base],  # temporary file name
    [qr/^userfile.*page.*[ =]\s*(\d+)$/i, \$Userfile_Page], # pages to report in the user file
    [qr/^user.*file.*[ =]\s*(\S+)$/i,     \$UserFile],      # user file name
    [qr/^page.*file.*[ =]\s*(\S+)$/i,     \$PageFile],      # page file name
    [qr/^robot.*file.*[ =]\s*(\S+)$/i,    \$RobotFile],     # robot file name
    [qr/^refer.*file.*[ =]\s*(\S+)$/i,    \$ReferFile],     # referrer file name
    [qr/^top.*text.*[ =]\s*(\d+)$/i,      \$Tc_tp],         # top hits shown, text pages
    [qr/^top.*image.*[ =]\s*(\d+)$/i,     \$Tc_ip],         # top hits shown, image pages
    [qr/^top.*first.*[ =]\s*(\d+)$/i,     \$Tc_fr],         # top hits shown, first contact pages
    [qr/^top.*refer.*[ =]\s*(\d+)$/i,     \$Tc_rf],         # top hits shown, referrers
    [qr/^top.*agent.*[ =]\s*(\d+)$/i,     \$Tc_agent],      # top hits shown, agents
  );

  while (<RCFILE>) {
    chomp;
    my $rccommand = $_;
    #Skip comment and blank lines
    next if ($rccommand =~ /^\s*#/ || $rccommand =~ /^ *$/);
    $rccommand =~ s/^\s*//;  #Remove white space from front.

    # Detect and set background color (the last token on the line)
    if ($rccommand =~ /^background.*[\s=:](\S+)\s*$/i) {
      $BGColor = $1;
      next;
    }
    # Detect and accumulate header lines
    if ($rccommand =~ /^header/i) {
      @list = &rc_list;
      $HTML_Header = join ("\n",@list);
      next;
    }
    # Detect and accumulate footer lines
    if ($rccommand =~ /^footer/i) {
      @list = &rc_list;
      $html_footer = join ("\n",@list);
      next;
    }
    # Detect and accumulate OK result codes
    if ($rccommand =~ /^ok.*results/i) {
      @list = &rc_list;
      foreach $line (@list) {
        next unless ($line =~ /^\d\d\d$/);
        push(@Ok_Codes,$line);
      }
      next;
    }

    # Detect and set the single-value settings
    $matched = 0;
    foreach my $setting (@settings) {
      next unless ($rccommand =~ $setting->[0]);
      ${$setting->[1]} = $1;
      $matched = 1;
      last;
    }
    next if ($matched);

    # Detect and accumulate page rules.
    if ($rccommand =~ /^page.*rules/i) {
      @list = &rc_list;
      _rc_add_rules(\@list,'Page rules',\$n_page_rules,
                    \@page_test,\@page_action,'include');
    }
    # Detect and accumulate page alias.
    elsif ($rccommand =~ /^page.*alias/i) {
      @list = &rc_list;
      foreach $line (@list) {
        next unless ($line =~ /^(\S+)\s+(.+)$/);
        my ($test,$alias) = ($1,$2);
        ++$N_Page_Alias;
        if ($N_Page_Alias == 1) {$rc_rules .= "\nPage alias:\n"}
        $Page_Alias_Test[$N_Page_Alias] = $test;
        $Page_Alias[$N_Page_Alias] = $alias;
        &pageassign($alias,0);  #Record the page.
        $rc_rules .= " $N_Page_Alias $test becomes $alias\n";
      }
    }
    # Detect and accumulate required pages (kept in their own arrays
    #   and counter, separate from the page rules).
    elsif ($rccommand =~ /^page.*required/i) {
      @list = &rc_list;
      _rc_add_rules(\@list,'Required pages',\$n_page_required_rules,
                    \@page_required_test,\@page_required_action,'exclude');
    }
    # Detect and set default domain (when host is unqualified):
    elsif ($rccommand =~ /^default.*domain/) {
      if ($rccommand =~ /(\S+)$/) {
        $Default_Domain = $1;
        $Default_Domain =~ s/=//;
      }
    }
    # Detect and accumulate domain rules.
    elsif ($rccommand =~ /^domain.*rules/i) {
      @list = &rc_list;
      _rc_add_rules(\@list,'Domain rules',\$N_Domain_Rules,
                    \@Domain_Test,\@Domain_Action,'include');
    }
    # Detect and accumulate inhouse domain rules.
    elsif ($rccommand =~ /^inhouse.*rules/i) {
      @list = &rc_list;
      _rc_add_rules(\@list,'Inhouse rules',\$N_Inhouse_Rules,
                    \@Inhouse_Test,\@Inhouse_Action,'exclude');
    }
    # Detect and accumulate referrer rules.
    elsif ($rccommand =~ /^refer.*rules/i) {
      @list = &rc_list;
      _rc_add_rules(\@list,'Refer rules',\$N_Refer_Rules,
                    \@Refer_Test,\@Refer_Action,'include');
    }
    # Detect and accumulate referrer alias.
    elsif ($rccommand =~ /^refer.*alias/i) {
      @list = &rc_list;
      foreach $line (@list) {
        next unless ($line =~ /^(\S+)\s+(.+)$/);
        my ($test,$alias) = ($1,$2);
        ++$N_Refer_Alias;
        if ($N_Refer_Alias == 1) {$rc_rules .= "\nRefer alias:\n"}
        $Refer_Alias_Test[$N_Refer_Alias] = $test;
        $Refer_Alias[$N_Refer_Alias] = $alias;
        $rc_rules .= " $N_Refer_Alias $test becomes $alias\n";
      }
    }
    # Detect and set options;
    elsif ($rccommand =~ /^option.*[ =]\s*(\S+)$/i) {
      $line = $1;
      $Skip_Agents = 1 if ($line =~ /A/);
      $skip_rules =1 if ($line =~ /C/);
      $skip_defs = 1 if ($line =~ /D/);
      $skip_hitinfo = 1 if ($line =~ /H/);
      $skip_pages = 1 if ($line =~ /P/);
      $NoQuestion = 1 if ($line =~ /Q/);
      $Skip_Refers = 1 if ($line =~ /R/);
      $skip_days = 1 if ($line =~ /T/);
      $skip_users = 1 if ($line =~ /U/);
      $exclude_robots = 1 if ($line =~ /Y/);
      $skip_robots = 1 if ($line =~ /Z/);
      $Multilog = 1 if ($line =~ /m/);
      $robots_only = 1 if ($line =~ /y/);
    }
    else {
      print "\nWARNING - unrecognizeable line in the .rc file \"$rc_file\".\n";
      print "The line is:\n";
      print "$rccommand\n";
      print "The line will be skipped.\n\n";
    }
  }
  close (RCFILE);
  return;
}

# ========================================================================
sub rc_list { #Reads a list from the .rc file.
  #Returns the lines up to (not including) a line holding only ".",
  #  or up to end of file, skipping comment and blank lines.
  my (@list);
  while (<RCFILE>) {
    chomp;
    my $line = $_;
    if ($line =~ /^\.$/) {return @list}
    next if ($line =~ /^\s*#/ || $line =~ /^ *$/);
    push (@list,$line);
  }
  return @list;
}

# ========================================================================
sub readcommandline { #Reads the command line
  my ($path,$error);

  #Account for arguments
  while ($ARGV[0] =~ /^-/) {shift(@ARGV)}

  #Starting and ending DDMMMYYYYZZZZZ
  $DateBegin = shift(@ARGV);
  $DateEnd = shift(@ARGV);

  #Server Name (every underscore stands for a space)
  $Server = shift(@ARGV);
  $Server =~ s/_/ /g;

  #Output file
  $Path_Report = shift(@ARGV);
  unless ($Path_Report =~ /\.html$/) {$Path_Report .= ".html"}
  unless ($Path_Report =~ /^>/) {$Path_Report = ">$Path_Report"}

  #Input files
  unless ($ARGV[0]) {print "Wrong number of arguments.\n"; &usage_die}
  @input_log =
(@ARGV);
  foreach $path (@input_log) {
    unless (-r $path) {$error = 1; print "Can not read $path\n"}
  }
  if ($error) {&usage_die}
  return;
}

# ========================================================================
sub readoptions { #Reads the options
  #Getopt::Std replaces the old getopts.pl library; getopts() sets
  #  the same $opt_x variables in the calling package.
  require Getopt::Std;
  Getopt::Std::getopts('ef:l:p:r:st:u:z:') || &usage_die;

  # Report the status every $Rpt lines read.
  $Rpt = $opt_p if $opt_p;

  # Temp file name:
  $Templog_Base = $opt_t if $opt_t;

  $UserFile  = $opt_u if ($opt_u);   #User List
  $PageFile  = $opt_l if ($opt_l);   #Page List
  $RobotFile = $opt_z if ($opt_z);   #Robot List
  $ReferFile = $opt_r if ($opt_r);   #Referrer List
  return;
}

#==================================================================
#==================================================================
#  Subroutines that produce the output follow.
#  Present in order of output.
#==================================================================
#==================================================================
sub show_banner { #Prints the html banner.
  #NOTE(review): the strings below look as if their HTML tags were
  #  stripped from this copy of the file; they are reproduced as found.
  print OUTFILE "\n";
  print OUTFILE "\n";
  print OUTFILE "",$Server," Server Statistics\n";
  print OUTFILE "\n";
  print OUTFILE "\n";
  print OUTFILE "$HTML_Header\n";
  print "Header completed.\n" if ! $opt_s;
  return;
}

#==================================================================
sub show_summary_block { #Prints the summary block.
  #Writes the title, the date range and the table of overall counts.
  #NOTE(review): the heading strings look as if their HTML tags were
  #  stripped from this copy of the file; they are reproduced as found.
  my ($pt,$cptotal);
  my ($title);
  my ($outside_users,$pctmultiday,$totmbytes);

  $title = $title_rc ? $title_rc : "Log analysis for server $Server\n";
  print OUTFILE "\n\n$title\n";
  print OUTFILE "$DateBegin - $DateEnd\n\n\n";
  print OUTFILE "\n\n";
  printf OUTFILE "%s%10d\n", "Pages served                        = ",$Total_Hits_Actual;
  #Content pages are all page types except images and maps.
  foreach $pt (@PageTypeClassOrder) {
    if (&iscontenttype($pt)) {$cptotal += $PageTypeHits{$pt}}
  }
  printf OUTFILE "%s%10d\n", "Content pages served (views)        = ",$cptotal;
  printf OUTFILE "%s%10d\n", "Pages accessed at least once        = ",$Pages_Hit_Actual;
  printf OUTFILE "%s%10d\n", "Visits                              = ",$Count_AllVisit;
  printf OUTFILE "%s%10d\n", "Content visits                      = ",$Count_Visit;
  printf OUTFILE "%s%10d\n", "Unique addresses served             = ",$Tot_User{'all'};
  $outside_users = $Tot_User{'all'} - $Tot_User{'inhouse'};
  printf OUTFILE "%s%10d\n", "Outside addresses served            = ",$outside_users;
  if ($outside_users) {
    $pctmultiday = $MultiDayUser * 100.0 / $outside_users;
    printf OUTFILE "%s%10d%s%4.1f%s", "Outside, served on more than 1 day  = ",
      $MultiDayUser," (",$pctmultiday,"%)\n";
  }
  $totmbytes = $Total_Bytes_Actual / 1000000;
  printf OUTFILE "Megabytes served                    = %10d\n", $totmbytes;
  if ($Total_Hits_Actual) {
    printf OUTFILE "Average page size                   = %10d\n", $Total_Bytes_Actual / $Total_Hits_Actual;
  }
  print OUTFILE "\n\n";
  return;
}

#==================================================================
sub show_user_pie { #Prints the user pie info
  my ($usertype);
  my ($pctpage,$pctvisit,$pctuser);
  print OUTFILE "\n\n";
  print OUTFILE "\n\nCharacteristics of Users Accessing This Server\n\n\n";
  print OUTFILE "\n\nService by user category:\n\n\n";
  print OUTFILE "\n\n";
  print OUTFILE "                     Pages Served          Visits       Addresses\n";
  $usertype = 'inhouse';
  $pctpage=$Tot_Page{'all'} > 0 ? $Tot_Page{$usertype}*100.0 / $Tot_Page{'all'}:0;
  $pctvisit=$Tot_Visit{'all'} > 0 ? $Tot_Visit{$usertype}*100.0 / $Tot_Visit{'all'}:0;
  $pctuser=$Tot_User{'all'} > 0 ? $Tot_User{$usertype}*100.0 / $Tot_User{'all'}:0;
  printf OUTFILE 
    "In-house:         %8d (%4.1f%%)  %6d (%4.1f%%)  %6d (%4.1f%%)\n",
    $Tot_Page{$usertype},$pctpage,$Tot_Visit{$usertype},$pctvisit,$Tot_User{$usertype},
    $pctuser;
  print OUTFILE "Outside users:\n";
  foreach $usertype ('gov','edu','com','majornet','foreign','other') {
    $pctpage=$Tot_Page{'all'} > 0 ? $Tot_Page{$usertype}*100.0 / $Tot_Page{'all'}:0;
    $pctvisit=$Tot_Visit{'all'} > 0 ? $Tot_Visit{$usertype}*100.0 /$Tot_Visit{'all'}:0;
    $pctuser=$Tot_User{'all'} > 0 ? $Tot_User{$usertype}*100.0 / $Tot_User{'all'}:0;
    printf OUTFILE "%s%8d (%4.1f%%)  %6d (%4.1f%%)  %6d (%4.1f%%)\n",
      $ULong{$usertype},$Tot_Page{$usertype},$pctpage,$Tot_Visit{$usertype},
      $pctvisit,$Tot_User{$usertype},$pctuser;
  }
  print OUTFILE "
\n"; print "Basic statistics completed.\n" if ! $opt_s; return; } #================================================================== sub show_user_days { #Prints days per user my ($usertype,$pct_1,$pct_2,$pct_3_5,$pct_gt5,$avg); print OUTFILE "

Percent of users visiting for \"n\" days:

\n"; print OUTFILE "
\n";
  print OUTFILE "                    ------------------ days ------------------\n";
  print OUTFILE "                        1       2     3-5      >5 average days\n";
  foreach $usertype ('all','inhouse','outside','gov','edu','com','majornet','foreign','other') {
    if ($Tot_User{$usertype} > 0) {
      $pct_1 = $Ud_1{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_2 = $Ud_2{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_3_5 = $Ud_3_5{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_gt5 = $Ud_gt5{$usertype} * 100.0 / $Tot_User{$usertype};
      $avg = $Tot_Day{$usertype} / $Tot_User{$usertype};
    }
    else {
      $pct_1 = 0; $pct_2 = 0; $pct_3_5 = 0; $pct_gt5 = 0;
      $avg = 0;
    }
    printf OUTFILE "%s%7.1f  %7.1f %7.1f %7.1f %8.2f\n",
    $ULong{$usertype},$pct_1,$pct_2,$pct_3_5,$pct_gt5,$avg;
  }
  print OUTFILE "
\n"; return; } #================================================================== sub show_user_visits { #Print stats on visits my ($usertype,$pct_0,$pct_1,$pct_2,$pct_3_5,$pct_gt5,$avg); print OUTFILE "

Percent of users visiting \"n\" times:

\n"; print OUTFILE "
\n";
  print OUTFILE "                    -------------------content visits -------------------\n";
  print OUTFILE "                        0       1       2     3-5      >5  average visits\n";
  foreach $usertype ('all','inhouse','outside','gov','edu','com','majornet','foreign','other') {
    if ($Tot_User{$usertype} > 0) {
      $pct_0 = $Uv_0{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_1 = $Uv_1{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_2 = $Uv_2{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_3_5 = $Uv_3_5{$usertype} * 100.0 / $Tot_User{$usertype};
      $pct_gt5 = $Uv_gt5{$usertype} * 100.0 / $Tot_User{$usertype};
      $avg = $Tot_Visit{$usertype} / $Tot_User{$usertype};
    }
    else {
      $pct_0 = 0; $pct_1 = 0; $pct_2 = 0; $pct_3_5 = 0; $pct_gt5 = 0;
      $avg = 0;
    }
    printf OUTFILE "%s%7.1f %7.1f %7.1f %7.1f %7.1f %8.2f\n",
      $ULong{$usertype},$pct_0,$pct_1,$pct_2,$pct_3_5,$pct_gt5,$avg;
#    print OUTFILE "usertype $usertype\n";
#    print OUTFILE "$Uv_0{$usertype}, $Uv_1{$usertype}, $Uv_2{$usertype}, $Uv_3_5{$usertype}, $Uv_gt5{$usertype}\n";
#    print OUTFILE "$Tot_User{$usertype}, $Tot_Visit{$usertype}\n"; 
  }
  print OUTFILE "
\n"; return; } #================================================================== sub show_user_other { #Prints clicks per visit, images per page my ($usertype,$clickspervisit,$imagesperpage); print OUTFILE "

Other user information:

\n"; print OUTFILE "
\n";
  print OUTFILE "                    clicks/visit       images/page\n";
  foreach $usertype ('all','inhouse','outside','gov','edu','com','majornet','foreign','other') {
    $clickspervisit = $Tot_Visit{$usertype} > 0 ? 
      $Tot_Click{$usertype} / $Tot_Visit{$usertype} : 0;
    $imagesperpage = $Tot_Click{$usertype} > 0 ?
      ($Tot_Page{$usertype}-$Tot_Click{$usertype}) / $Tot_Click{$usertype} : 0;
    printf OUTFILE "%s      %6.2f              %6.2f\n",
    $ULong{$usertype},$clickspervisit,$imagesperpage;
  }
  print OUTFILE "
\n"; return; } #================================================================== sub show_robots { #Prints a table showing robot activity print OUTFILE "

Robot Summary

\n"; print OUTFILE "
\n";
  printf OUTFILE "%s%8d%s","Number of robots = ",$Robot_Count,"\n";
  printf OUTFILE "%s%8d%s","Visits by robots = ",$Robot_Visit,"\n";
  printf OUTFILE "%s%8d%s","Pages to robots  = ",$Robot_Page,"\n";
  print OUTFILE "
\n"; print "Robots completed.\n" if ! $opt_s; } # ======================================================================== sub show_pages_by_type { #Print Characteristics of Pages Served my ($i,$pct,$pt); print OUTFILE "
\n"; print OUTFILE "

Characteristics of Pages Served

\n"; print OUTFILE "

Pages served by page type:

\n"; print OUTFILE "
\n";
  foreach $pt (@PageTypeClassOrder) {
    if ($PageTypeHits{$pt}) {
      $pct = $PageTypeHits{$pt} * 100.0 / $Count_Success;
      printf OUTFILE "%8d%s%4.1f%s", $PageTypeHits{$pt}," (",$pct,"%) $PageTypeClass{$pt}\n";
    }
  }
  print OUTFILE "
\n"; return; } #================================================================== sub show_page_file { #Writes the page file my ($i); return unless ($PageFile); foreach $i (1..$TotPage) { print PAGELIST "$i\t$PageName[$i]\t$PageHits[$i]\n"; } return; } #================================================================== sub show_page_access { #Show pages in order of descending access. #(This looks complicated, but it is fast.) my (@templist); my (@tempval); my (@tempsel); my ($pagenum,$minhits_here); my ($i); $minhits_here = $MinHits; #Pre-screen while (1) { foreach $pagenum (1..$TotPage) { $pagetype = &classpage($pagenum); next if ($PageHits[$pagenum] < $minhits_here) || &classpage($pagenum) eq 'image'; push(@templist,$pagenum); push(@tempval,$PageHits[$pagenum]); } #Check that we've selected enough pages if ($#templist < ($Tc_tp - 1) && $minhits_here > 1) { #Excluded too many. Change $minhits_here. $minhits_here = int($minhits_here / 2); unless ($minhits_here) {$minhits_here = 1}; @templist = (); @tempval = (); @tempsel = (); } else { #Get a list oSf the top n hits. foreach $i (&topn($Tc_tp,@tempval)) {push (@tempsel,$templist[$i])} #Check that we have top n possible. last; } } $Tc_tp = $#tempsel + 1; print OUTFILE "

Top ",$Tc_tp," non-image pages accessed:

\n"; print OUTFILE "
\n";
  print OUTFILE "    hits   visits     users    page\n";
  foreach $pagenum (sort {$PageHits[$b] <=> $PageHits[$a]} @tempsel) {
    last unless ($PageHits[$pagenum]);
    printf OUTFILE "%8d%s%8d%s%8d%s%s%s", $PageHits[$pagenum]," ",$PageVisit[$pagenum]," ",$PageUser[$pagenum]," ",$PageName[$pagenum],"\n";
  }
  print OUTFILE "
\n"; print "Top $Tc_tp non-image pages completed.\n" if ! $opt_s; return; } #================================================================== sub show_page_access_image { #Show image pages in order of descending access. local (@templist); local (@tempval); local (@tempsel); local ($pagenum,$minhits_here); $minhits_here = $MinHits; #Pre-screen while (1) { foreach $pagenum (1..$TotPage) { $pagetype = &classpage($pagenum); next if ($PageHits[$pagenum] < $minhits_here) || &classpage($pagenum) ne 'image'; push(@templist,$pagenum); push(@tempval,$PageHits[$pagenum]); } #Check that we've selected enough pages if ($#templist < ($Tc_ip - 1) && $minhits_here > 1) { #Excluded too many. Change $minhits_here. $minhits_here = int($minhits_here / 2); unless ($minhits_here) {$minhits_here = 1}; @templist = (); @tempval = (); @tempsel = (); } else { #Get a list of the top n hits. foreach $i (&topn($Tc_ip,@tempval)) {push (@tempsel,$templist[$i])} #Check that we have top n possible. last; } } $Tc_ip = $#tempsel + 1; print OUTFILE "

Top ",$Tc_ip," image pages accessed:

\n"; print OUTFILE "
\n";
  foreach $pagenum (sort {$PageHits[$b] <=> $PageHits[$a]} @tempsel) {
    last unless ($PageHits[$pagenum]);
    printf OUTFILE "%8d%s%s%s", $PageHits[$pagenum]," ",$PageName[$pagenum],"\n";
  }
  print OUTFILE "
\n"; print "Top $Tc_ip image pages completed.\n" if ! $opt_s; return; } #================================================================== sub show_first_access { #Prints first access pages local (@templist); local (@tempval); local (@tempsel); local ($pagenum,$minhits_here); $minhits_here = $MinHits; #Pre-screen while (1) { foreach $pagenum (1..$#pagefirst) { $pagetype = &classpage($pagenum); next if ($PageFirst[$pagenum] < $minhits_here) || &classpage($pagenum) eq 'image'; push(@templist,$pagenum); push(@tempval,$PageFirst[$pagenum]); } #Check that we've selected enough pages if ($#templist < ($Tc_fr - 1) && $minhits_here > 1) { #Excluded too many. Change $minhits_here. $minhits_here = int($minhits_here / 2); unless ($minhits_here) {$minhits_here = 1}; @templist = (); @tempval = (); @tempsel = (); } else { #Get a list of the top n hits. foreach $i (&topn($Tc_fr,@tempval)) {push (@tempsel,$templist[$i])} #Check that we have top n possible. last; } } $Tc_fr = $#tempsel + 1; print OUTFILE "

Top ",$Tc_fr," \"first contact\" pages:

\n"; print OUTFILE "
\n";
  foreach $pagenum (sort { $PageFirst[$b] <=> $PageFirst[$a] } @tempsel) {
    last unless ($PageFirst[$pagenum]);
    printf OUTFILE "%8d%s%s%s", $PageFirst[$pagenum]," ",$PageName[$pagenum],"\n";
  }
  print OUTFILE "
\n"; print "Top $Tc_fr first-access pages completed.\n" if ! $opt_s; @templist = (); @tempval = (); @tempsel = (); return; } #================================================================== sub show_referers { #Print referers local (@templist); local (@tempval); local (@tempsel); local ($refernum,$minhits_here,$i); if ($ReferFile) { #Print the file of referrers for later analysis. open (REFERLIST,">$ReferFile"); foreach $refernum (1..$#refername) { printf REFERLIST "%8d%s%s%s", $ReferFirst[$refernum]," ",$ReferName[$refernum],"\n"; } close (REFERLIST); } $minhits_here = $MinHits; #Pre-screen while (1) { foreach $refernum (1..$#refername) { next if ($ReferFirst[$refernum] < $minhits_here); push(@templist,$refernum); push(@tempval,$ReferFirst[$refernum]); } #Check that we've selected enough pages if ($#templist < ($Tc_rf - 1) && $minhits_here > 1) { #Excluded too many. Change $minhits_here. $minhits_here = int($minhits_here / 2); unless ($minhits_here) {$minhits_here = 1}; @templist = (); @tempval = (); @tempsel = (); } else { #Get a list of the top n hits. foreach $i (&topn($Tc_rf,@tempval)) {push (@tempsel,$templist[$i])} #Check that we have top n possible. last; } } $Tc_rf = $#tempsel + 1; print OUTFILE "

Top ",$Tc_rf," referrers:

\n"; print OUTFILE "
\n";
  foreach $refernum (sort { $ReferFirst[$b] <=> $ReferFirst[$a] } @tempsel) {
    last unless ($ReferFirst[$refernum]);
    printf OUTFILE "%8d%s%s%s", $ReferFirst[$refernum]," ",$ReferName[$refernum],"\n";
  }
  print OUTFILE "
\n"; print "Top $Tc_rf referrers completed.\n" if ! $opt_s; @templist = (); @tempval = (); @tempsel = (); return; } #================================================================== sub show_agents { #Prints agents local (@ahold); local ($sumvisits,$cumvisits,$pct,$cumpct); #Compute total visits for cumulative percent. foreach $agentnum (1..$#agenthit) {$sumvisits += $AgentHit[$agentnum]} return unless ($sumvisits); print OUTFILE "
\n"; print OUTFILE "

Characteristics of Agents

\n"; print OUTFILE "There were $#agenthit agents.\n"; @ahold = &topn($Tc_agents,@agenthit); print OUTFILE "

Top ",$Tc_agents," agents:

\n"; print OUTFILE "
\n";
  print OUTFILE "  Visits   Pct  Cum_pct  Agent\n";
  foreach $agentnum (sort {$AgentHit[$b] <=> $AgentHit[$a]} @ahold) {
    last if $AgentHit[$agentnum] == 0;
    $cumvisits += $AgentHit[$agentnum];
    $pct = 100.0 * $AgentHit[$agentnum] / $sumvisits;
    $cumpct = 100.0 * $cumvisits / $sumvisits;
    printf OUTFILE "%8d%s%5.1f%s%8.1f%s%s%s", $AgentHit[$agentnum],' ',$pct,' ',$cumpct,'  ',$AgentName[$agentnum],"\n";
  }
  print OUTFILE "
\n"; print "Top $Tc_agents agents completed.\n" if ! $opt_s; return; } #================================================================== sub show_days { #Prints out the hits per day. local ($numday); print OUTFILE "
\n"; print OUTFILE "

Characteristics of Access Days

\n"; print OUTFILE "

Pages served per day:

\n"; if ($#dayhits > 365) { print OUTFILE "More than 365 days with hits.\n"; return 0; } print OUTFILE "
\n";
  print OUTFILE "    day      pages\n";
  for ($i = 0; $i <= $#dayhits; $i++){
    $numday = $i + 1;
    printf OUTFILE "%8d%10d%s",$numday,$DayHits[$i],"\n";
  }
  print OUTFILE "
\n"; return; } #================================================================== sub show_rules { #Prints the selection and alias rules print OUTFILE "
\n"; print OUTFILE "

Selection and/or Alias Rules for This Analysis

\n"; print OUTFILE "
\n";
  print OUTFILE "$rc_rules";
  print OUTFILE "
\n"; return; } #================================================================== sub show_defs { #Print the definitions of categories. print OUTFILE "
\n"; print OUTFILE "

Definitions

\n"; print OUTFILE "
\n"; print OUTFILE "
User\n"; print OUTFILE "
For purposes of this analysis, a \"user\" is equated with a unique domain name.\n"; print OUTFILE "Some domains are shared by two or more people. Some people have two or more domains.\n"; print OUTFILE "
Page\n"; print OUTFILE "
A single file requested from the webserver.\n"; print OUTFILE "The file may be text, images, or data.\n"; print OUTFILE "Sometimes called a \"hit.\"\n"; print OUTFILE "A request to execute a script also counts as a page.\n"; print OUTFILE "
Content page.\n"; print OUTFILE "
Any page except an image, an imagemap, or a cascading style sheet."; print OUTFILE "Sometimes called a \"\;view\"\; or a \"\;click\"\;"; print OUTFILE "because the user typically has to click the mouse to see this page.\n"; print OUTFILE "
Visit\n"; print OUTFILE "
Any number of requests from a single user,\n"; print OUTFILE "each of which follows the previous one by no more than $VisitLength seconds.\n"; print OUTFILE "
Content visit\n"; print OUTFILE "A visit consisting of at least one content page."; print OUTFILE "Some visits consist only of images, usually called as inline images"; print OUTFILE "from a page on another site. Therefore, a content visit means"; print OUTFILE "that a user deliberately chose to come to this site.\n"; print OUTFILE "
\"First contact\" Page\n"; print OUTFILE "
The first content page accessed in a visit.\n"; print OUTFILE "
Referer\n"; print OUTFILE "
URL from which the user requested the first content page of a visit.\n"; print OUTFILE "
Agent\n"; print OUTFILE "
Software (browser) used for the first content page of a visit.\n"; print OUTFILE "
Government Domain\n"; print OUTFILE "
Includes users in \"gov\", \"us\", or \"mil\" domains.\n"; print OUTFILE "
Commercial Domain\n"; print OUTFILE "
Includes users in the \"com\", \"org\", or \"net\" domains, but not major networks.\n"; print OUTFILE "
Major Network Domain\n"; print OUTFILE "
Includes users in the \"aol.com\", \"prodigy.com\", or \"compuserve.com\" domains.\n"; print OUTFILE "

\n"; return; } #================================================================== sub show_footer { #This is the last stuff printed. #Note the program and version print OUTFILE "

\n"; print OUTFILE "Analysis by log_profile.pl version ",$version,"\n"; #Footer print OUTFILE "


\n"; print OUTFILE "$html_footer\n"; print OUTFILE "\n"; print OUTFILE "\n"; return; } #================================================================== #================================================================== # The following routines could be placed in a separate library. #================================================================== # ======================================================================== # ======================================================================== sub delta_day { #Computes time difference in integer days, from times in seconds. local ($time1,$time2) = (@_); local ($days); if ($time1 > $time2) {return 0} $days = ($time2-$time1) / 86400; $days = int($days); return $days; } # ================================================================== sub ParseTime { #Parses the time string within the square brackets in a log. my ($string) = $_[0]; my ($time); my ($day, $mona, $year, $hour, $min, $sec, $offset, $mon); $string =~ s/ //g; unless ($string =~ /^(\d\d)\/?(...)\/?(\d\d\d\d)\:(\d\d)\:(\d\d)\:(\d\d)(.*)/) {return 0}; $day = $1; $mona = $2; $year = $3; $hour = $4; $min = $5; $sec = $6; $offset = $7; $offset =~ s/ //g; if ($offset) {$LastOffset = $offset} else {$offset = $LastOffset} $mona =~ tr/[A-Z]/[a-z]/; $mon = $AlphaMonthToDeciMonth{$mona}; $time = &TimeSeconds($year, $mon, $day, $hour, $min, $sec, $offset); return $time; } # ======================================================================== sub get_user_from_log { #Extracts the user name from a log-like string. 
local ($record) = $_[0]; local ($user); if ($record =~ /^(\S+) /){$user = $1} return $user; } # ======================================================================== sub readoffset { #Reads offset from GMT local ($offset) = $_[0]; if ($offset == $SaveOffset) {return $OffsetSeconds}; $SaveOffset = $offset; local ($plusminus, $off_hr, $off_min); unless ($offset =~ /^(.)(\d\d)(\d\d)$/) {return 0} $plusminus=$1; $off_hr=$2 ;$off_min=$3; if ($off_hr > 24 || $off_min > 60) {return 0}; unless ($plusminus =~ /[ \+-]/) {return 0}; $OffsetSeconds = $off_hr * 3600 + $off_min * 60; if ($plusminus =~ /-/) {$OffsetSeconds = -1 * $OffsetSeconds} return $OffsetSeconds; } # ================================================================== sub TimeSeconds { #Computes time in seconds since Jan 1, 1970 GMT #Month is 1,2...12. #Requires the package Time::Local; my ($year, $mon, $day, $hour, $min, $sec, $offset) = (@_); my ($time); my ($offhr,$offmin,$offsgn); $year = int $year; $mon = int $mon; $day = int $day; $hour = int $hour; $min = int $min; unless ($year >= 1970 && $mon > 0 && $day > 0) {return 0} unless ($mon <= 12 && $day <= 31) {return 0} unless ($hour < 24 && $min < 60 && $sec < 60) {return 0} unless ($hour >= 0 && $min >= 0 && $sec >= 0) {return 0} $mon--; $year -= 1900; $time = timegm($sec,$min,$hour,$day,$mon,$year); if ($offset) { if ($offset =~ s/$\-//) {$offsgn = -1} else {$offsgn = 1} if ($offset =~ /^(\d\d)(\d\d)$/) { $offhr = $1; $offmin = $2; if ($offhr < 25 && $offmin < 60) { $time += $offsgn * (3600 * $offhr + 60 *$offmin); } } } return $time; } # ======================================================================== sub topn { #Returns lowest qualifying value for a list of "top n" #Arguments: # $n = number of qualifying values desired # @a = input array of numbers or values # @selected = Returned array of n indexes, no order. 
local ($n,@a) = @_; local (@possible) = (0..$n); local (@selected); local ($i,$m,$cutoff,@t); $m = $n - 1; if ($m > $#a) {$m = $#a} for ($i = 0; $i <= $m; $i++) {$t[$i] = $a[$i]} @t = sort {$a <=> $b} (@t); for ($i = $m; $i <= $#a; $i++) { if ($a[$i] > $t[0]) { $t[0] = $a[$i]; @t = sort {$a <=> $b} (@t) } } $cutoff = $t[0]; #Lowest qualifying value for ($i = 0; $i <= $#a; $i++) { if ($a[$i] >= $cutoff) {push(@selected,$i)} } return (@selected); } # ======================================================================== #(Last Line)