PHP Classes

File: class.protector.inc.php

Recommend this page to a friend!
  Classes of Nico den Boer   Anti-Harvest Download   class.protector.inc.php   Download  
File: class.protector.inc.php
Role: Class source
Content type: text/plain
Description: class file
Class: Anti-Harvest Download
Serve files for download preventing harvesting
Author: By Nico den Boer
Last change: small bugfix
Date: 13 years ago
Size: 8,524 bytes
 

Contents

Class file image Download
<?php
/**
 * <p>
 * Script to safely serve a file for download.
 * And, most important, block downloads for bots to save bandwidth on our server
 * </p>
 *
 * <p>
 * No harvesting allowed!
 * </p>
 *
 * @author Nico den Boer <nico@nicodenboer.com>, <www.nicodenboer.com>, <www.denboer-ims.nl>
 * @version 1.0.1
 * @package AntiHarvestDownload
 */

/**
 * Class to do the actual work
 *
 * @package AntiHarvestDownload
 */
class protector {

    /**
     * (Partial) user agent names which identify bots
     *
     * @var array
     * @access private
     */
    private $_agents;

    /**
     * (Partial) host names which identify bots or harvesters
     *
     * @var array
     * @access private
     */
    private $_hosts;

    /**
     * Path to the actual files; prepended as-is to the requested file name
     *
     * @var string
     * @access private
     */
    private $_path;

    /**
     * Determines if we use predis or not
     *
     * @var bool
     * @access private
     */
    private $_predisUse;

    /**
     * Path of the file which is required to load predis
     *
     * @var string
     * @access private
     */
    private $_predisPath;

    /**
     * Holds the time in seconds that visits will expire
     *
     * @var int
     * @access private
     */
    private $_predisTime;

    /**
     * Holds the number of visits which are allowed within the timeframe
     *
     * @var int
     * @access private
     */
    private $_predisCnt;

    /**
     * Constructor. Installs the default bot lists; predis stays disabled
     * until setPredis() is called.
     *
     * @access public
     * @return void
     */
    public function __construct() {
        $this->_agents = array(
            'wget', 'emailsiphon', 'webzip', 'msproxy/2.0',
            'emailwolf', 'webbandit', 'ms frontpage', 'bot',
            'slurp', 'scooter', 'spider', 'crawler',
            'worm', 'internetseer.com', 'archiver', 'msnptc',
            'libwww-perl', 'channel-index', 'linkwalker', 'holmes',
            'yeti', 'indexer'
        );
        $this->_hosts = array(
            'bot', 'spider', 'crawler', 'yandex'
        );
        $this->_path = '';
        $this->_predisUse = false;
        $this->_predisPath = '';
        $this->_predisTime = 0;
        $this->_predisCnt = 0;
    }

    /**
     * Set (partial) names of user agents, which we know they are bots.
     * Matching is case insensitive.
     *
     * @access public
     * @param array $agents
     * @return void
     */
    public function setAgents($agents) {
        $this->_agents = $agents;
    }

    /**
     * Set (partial) host names which we know as bots or harvesters.
     * Matching is case insensitive. With an empty list the host check
     * (including the reverse DNS lookup) is skipped.
     *
     * @access public
     * @param array $hosts
     * @return void
     */
    public function setHosts($hosts) {
        $this->_hosts = $hosts;
    }

    /**
     * Set the path to the physical downloads. The file name is appended
     * without a separator, so the path should end with one.
     *
     * @access public
     * @param string $str
     * @return void
     */
    public function setPath($str) {
        $this->_path = $str;
    }

    /**
     * Initialize predis
     *
     * @access public
     * @param string $path File to require in order to load predis
     * @param int $time Number of seconds a registered visit is kept
     * @param int $cnt Number of visits allowed within $time seconds
     * @return void
     */
    public function setPredis($path, $time, $cnt) {
        $this->_predisUse = true;
        $this->_predisPath = $path;
        $this->_predisTime = $time;
        $this->_predisCnt = $cnt;
    }

    /**
     * Process the request.
     *
     * Reads the file name from $_GET['file'] and answers with:
     * - 404 when the file is missing or its extension is not accepted,
     * - 403 when the user agent, the host name or the visit rate looks like a bot,
     * - the file itself otherwise.
     *
     * @access public
     * @return void
     */
    public function process() {
        $file = $this->_requestedFile();
        $fullPath = $this->_path . $file;
        if ($file === '' || !is_file($fullPath)) {
            // File does not exist, return error code
            header('HTTP/1.0 404 Not Found');
            echo '<h1>File does not exist</h1>';
            return;
        }

        $ctype = $this->_contentType($file);
        if ($ctype === false) {
            // Extension not known.
            header('HTTP/1.0 404 Not Found');
            echo '<h1>Filename not accepted</h1>';
            return;
        }

        $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
        $remoteAddr = isset($_SERVER['REMOTE_ADDR']) ? (string) $_SERVER['REMOTE_ADDR'] : '';

        // Cheap checks first; redis is only consulted for visitors which passed the others
        if ($this->_isBlockedAgent($userAgent)
            || $this->_isBlockedHost($remoteAddr)
            || ($this->_predisUse && $this->_isHarvesting($remoteAddr, $file))
        ) {
            $this->_refuse();
            return;
        }

        $this->_send($fullPath, $file, $ctype);
    }

    /**
     * Reduce $_GET['file'] to a bare file name.
     *
     * @access private
     * @return string File name without any directory part, '' when unusable
     */
    private function _requestedFile() {
        if (!isset($_GET['file']) || !is_string($_GET['file'])) {
            return '';
        }
        // Decode entities BEFORE stripping directories: decoding afterwards would turn
        // something like "..&#47;x" back into "../x" and allow leaving the download path
        $file = html_entity_decode($_GET['file'], ENT_COMPAT, 'UTF-8');
        if (strpos($file, "\0") !== false) {
            // NUL bytes never belong in a file name
            return '';
        }
        return basename($file);
    }

    /**
     * Determine the mime type from the extension of a file name.
     *
     * @access private
     * @param string $file
     * @return string|bool Mime type, false when the extension is missing or not accepted
     */
    private function _contentType($file) {
        $types = array(
            'pdf'  => 'application/pdf',
            'zip'  => 'application/zip',
            'doc'  => 'application/msword',
            'xls'  => 'application/vnd.ms-excel',
            'ppt'  => 'application/vnd.ms-powerpoint',
            'gif'  => 'image/gif',
            'png'  => 'image/png',
            'jpeg' => 'image/jpeg',
            'jpg'  => 'image/jpeg',
            'mpeg' => 'video/mpeg',
            'mpg'  => 'video/mpeg',
            'mpe'  => 'video/mpeg',
            'mov'  => 'video/quicktime',
            'avi'  => 'video/x-msvideo',
            'mp3'  => 'audio/mpeg',
            'wav'  => 'audio/x-wav',
            'xml'  => 'text/xml',
            'txt'  => 'text/plain',
            '7z'   => 'application/octet-stream',
            'exe'  => 'application/octet-stream'
        );
        $pos = strrpos($file, '.');
        if ($pos === false) {
            return false;
        }
        $ext = strtolower(substr($file, $pos + 1));
        return isset($types[$ext]) ? $types[$ext] : false;
    }

    /**
     * Case insensitive test whether a string contains one of the given fragments.
     *
     * @access private
     * @param string $haystack
     * @param array $needles
     * @return bool
     */
    private function _containsAny($haystack, $needles) {
        foreach ($needles as $needle) {
            $needle = (string) $needle;
            // An empty fragment would match everything (or warn on old PHP versions)
            if ($needle !== '' && stripos($haystack, $needle) !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * See if the user agent needs to be refused.
     * A missing or empty user agent is refused as well.
     *
     * @access private
     * @param string $userAgent
     * @return bool
     */
    private function _isBlockedAgent($userAgent) {
        if (empty($this->_agents)) {
            return false;
        }
        if (strlen($userAgent) == 0) {
            return true;
        }
        return $this->_containsAny($userAgent, $this->_agents);
    }

    /**
     * See if the host name of the visitor needs to be refused.
     * A visitor without a usable address or host name is refused as well.
     *
     * @access private
     * @param string $remoteAddr
     * @return bool
     */
    private function _isBlockedHost($remoteAddr) {
        if (empty($this->_hosts)) {
            // Nothing to compare with, no need for a DNS lookup
            return false;
        }
        if (filter_var($remoteAddr, FILTER_VALIDATE_IP) === false) {
            // gethostbyaddr() can not resolve this
            return true;
        }
        $host = gethostbyaddr($remoteAddr);
        if ($host === false || strlen($host) == 0) {
            return true;
        }
        return $this->_containsAny($host, $this->_hosts);
    }

    /**
     * Register this visit in redis and see if the visitor exceeds the allowed
     * number of downloads within the configured timeframe.
     *
     * Assumes that the file set with setPredis() makes the class Predis\Client
     * available and that the default connection of that client is usable.
     *
     * @access private
     * @param string $remoteAddr
     * @param string $file
     * @return bool True when we have a harvesting bot
     */
    private function _isHarvesting($remoteAddr, $file) {
        require_once($this->_predisPath);
        // Class name as a string, so this file still parses on PHP versions without namespaces
        $class = 'Predis\\Client';
        $redis = new $class();

        $pkey = 'download_' . $remoteAddr; // our checks are bound to IP numbers
        $stored = $redis->get($pkey);
        $visits = array();
        if (is_string($stored) && strlen($stored) > 0) {
            // Previous visits found; ignore the stored value if it is not what we wrote
            $decoded = json_decode($stored, true);
            if (is_array($decoded)) {
                $visits = $decoded;
            }
        }

        // Register this visit
        $now = time();
        $visits[] = array(
            'time' => $now,
            'file' => $file
        );

        // Keep only the visits which are not expired yet
        $write = array();
        foreach ($visits as $visit) {
            if (is_array($visit) && isset($visit['time']) && $visit['time'] > ($now - $this->_predisTime)) {
                $write[] = $visit;
            }
        }

        // Write visits to redis
        $redis->set($pkey, json_encode($write));
        return count($write) > $this->_predisCnt;
    }

    /**
     * Tell the visitor that the download is refused.
     *
     * @access private
     * @return void
     */
    private function _refuse() {
        header('HTTP/1.0 403 Forbidden');
        echo '<h1>Not authorized</h1>
<p>
    We have detected that you are (behaving like) a bot, which is not respecting the content
    of robots.txt and harvesting files from our site.
</p>
<p>
    Please correct this behavior, since it is considered quite impolite and disrespectful
    to ignore explicit directions by web site owners, put into place according to
    <a href="http://www.robotstxt.org/robotstxt.html">worldwide standards</a>.
</p>';
    }

    /**
     * Send the file to the visitor as an attachment.
     *
     * @access private
     * @param string $fullPath Location of the file on disk
     * @param string $file Name which is offered to the visitor
     * @param string $ctype Mime type
     * @return void
     */
    private function _send($fullPath, $file, $ctype) {
        // Required for IE, otherwise Content-disposition is ignored
        if (ini_get("zlib.output_compression")) {
            ini_set("zlib.output_compression", "Off");
        }
        header('Pragma: public'); // required
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', filemtime($fullPath)) . ' GMT');
        header('Expires: 0');
        header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
        header('Cache-Control: private', false); // required for certain browsers
        // Quotes and backslashes would break out of the quoted file name
        header(sprintf('Content-Disposition: attachment; filename="%s"', addcslashes($file, "\"\\")));
        // Send Content-Transfer-Encoding HTTP header
        // (use binary to prevent files from being encoded/messed up during transfer)
        header('Content-Transfer-Encoding: binary');
        header('Content-Length: ' . filesize($fullPath));
        header('Content-Type: ' . $ctype);
        header('Content-Description: File Transfer');
        readfile($fullPath);
    }
}

/**
 * Utility function: an error handler which deliberately does nothing.
 *
 * This script does not handle errors, it ignores them. The error it anticipates
 * is a failing host name lookup using gethostbyaddr().
 *
 * Note that the handler does not look at its arguments, so every error which
 * is passed to it is dropped, whatever its origin. Because it never returns
 * false, PHP will not fall back to its standard error handling.
 *
 * @param int $errorNr Level of the error (ignored)
 * @param string $errorMessage Error message (ignored)
 * @param string $errorFile File in which the error was raised (ignored)
 * @param int $errorLine Line at which the error was raised (ignored)
 * @return void
 * @package AntiHarvestDownload
 */
function errorHandler($errorNr, $errorMessage, $errorFile, $errorLine) {
    // Intentionally empty: swallow the error
    return null;
}

// Runs as soon as this file is included: from here on PHP errors are passed
// to errorHandler(), which has an empty body and therefore ignores them.
set_error_handler('errorHandler');

?>