countGetData = true;
# NOTE(review): the line above is the tail of a statement that is cut off before
# the start of this view (presumably `$spider->countGetData = true;`, preceded by
# the opening PHP tag and the `new spider(...)` call) - confirm against the full file.
#$spider->output = 'sitemap.xml';
$spider->crawl();
$spider->output();

/**
 * Crawls one site by following its anchor links and builds an XML sitemap
 * (sitemaps.org 0.9 format) from every page that answers with HTTP 200.
 *
 * Typical use: construct with the site URL, optionally set the public flags,
 * call crawl() and then output().
 *
 * NOTE(review): the XML tags in the sitemap strings and the leading part of
 * $linkReg were missing from the reviewed source (they look like they were
 * removed by a markup stripper) and have been reconstructed - confirm against
 * a pristine copy of this file if one exists.
 */
class spider
{
    /** Base URL of the site; relative links are appended to it. */
    public $site = '';

    /** Maximum number of pages to add to the sitemap; 0 means unlimited. */
    public $limit = 0;

    /** The sitemap XML accumulated so far. */
    public $sitemap = '';

    /** URLs that have already been fetched successfully and added. */
    public $crawled = array();

    /** Pattern body (no delimiters) that captures the href of an anchor in group 2. */
    public $linkReg = '<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>';

    /** Matches one raw HTTP response header block, including its terminating blank line. */
    public $headersReg = '#HTTP/\d(?:\.\d)?.*?$.*?\r\n\r\n#ims';

    /** URLs with one of these extensions are crawled; extension-less URLs are crawled too. */
    public $allowed = '/^.*\.(html|htm|php|asp|aspx|pdf)$/i';

    /** Number of URLs added to the sitemap so far. */
    public $numPages = 0;

    /** When false, the query string is removed from discovered links. */
    public $countGetData = false;

    /** When false, the #fragment is removed from discovered links. */
    public $countHashLinks = false;

    /** Target file for output(); when empty the sitemap is sent to the client instead. */
    public $output = '';

    /**
     * @param string $site  Base URL of the site to crawl (required).
     * @param int    $limit Maximum number of pages to add; 0 for no limit.
     */
    function __construct($site = '', $limit = 0)
    {
        $this->site = $site;
        $this->limit = $limit;

        if (! $this->site) {
            die('No site supplied.');
        }
    }

    /**
     * Legacy PHP 4 style constructor name. PHP 8 no longer runs it on `new`,
     * so the real work lives in __construct(); this stays for code that calls
     * it explicitly.
     */
    function spider($site = '', $limit = 0)
    {
        $this->__construct($site, $limit);
    }

    /**
     * Builds the complete sitemap document in $this->sitemap, starting the
     * crawl at $this->site.
     */
    function crawl()
    {
        $this->sitemap = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
        $this->sitemap .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
        $this->getUrl($this->site);
        $this->sitemap .= '</urlset>' . "\n";
    }

    /**
     * Fetches $url, adds it to the sitemap and recursively follows every
     * same-site link found in it that has not been crawled yet.
     *
     * @param string $url Absolute URL to fetch.
     * @return false|null false when the page could not be retrieved (retr()
     *                    failed or the status was not 200); nothing otherwise.
     */
    function getUrl($url)
    {
        $data = $this->retr($url);
        if (! $data) {
            return false;
        }

        // Separate the response headers from the body. The last matched block
        // is used, so interim header blocks before the final response are skipped.
        $matches = array();
        preg_match_all($this->headersReg, $data['contents'], $matches);
        $headers = array();
        if (! empty($matches[0])) {
            $headerBlock = array_pop($matches[0]);
            $headers = explode("\r\n", str_replace("\r\n\r\n", '', $headerBlock));
        }
        $html = preg_replace($this->headersReg, '', $data['contents']);
        if ($html === null) {
            // preg_replace() failed; fall back to the raw response.
            $html = $data['contents'];
        }

        // Header names are case-insensitive, and the value may itself contain
        // colons, so split on the first colon only.
        $modified = 0;
        foreach ($headers as $header) {
            $parts = explode(':', $header, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Last-Modified') === 0) {
                $stamp = strtotime(trim($parts[1]));
                $modified = ($stamp === false) ? 0 : $stamp;
            }
        }

        $this->crawled[] = $url;
        $this->addUrl($url, $modified);

        $matches = array();
        if (! preg_match_all('/' . $this->linkReg . '/siU', $html, $matches)) {
            return;
        }

        foreach ($matches[2] as $newurl) {
            $isAbsolute = strpos($newurl, 'http://') !== false || strpos($newurl, 'https://') !== false;

            if ($isAbsolute) {
                // Absolute link: only follow it when it points at this site.
                if (strpos($newurl, $this->site) === false) {
                    continue;
                }
            } else {
                // Relative link: appended to the site root with exactly one
                // slash in between. Dot segments (./ and ../) are not resolved
                // here, and links are not resolved against the current page's
                // directory.
                $newurl = rtrim($this->site, '/') . '/' . ltrim($newurl, '/');
            }

            if ($this->countGetData === false) {
                $pieces = explode('?', $newurl);
                $newurl = $pieces[0];
            }

            if ($this->countHashLinks === false) {
                $pieces = explode('#', $newurl);
                $newurl = $pieces[0];
            }

            $newurl = html_entity_decode($newurl);

            $alreadyCrawled = in_array($newurl, $this->crawled, true);
            $crawlableType = preg_match($this->allowed, $newurl) || $this->containsDot($newurl) === false;
            $isMailto = stripos($newurl, 'mailto') !== false;

            if ($alreadyCrawled || ! $crawlableType || $isMailto) {
                continue;
            }

            // Stop following links once the configured number of pages has been added.
            if ($this->limit != 0 && $this->numPages >= $this->limit) {
                break;
            }

            $this->getUrl($newurl);
        }
    }

    /**
     * Tells whether one of the last five characters of $str is a dot, which
     * is used as a cheap "has a file extension" test.
     *
     * @param string $str
     * @return bool
     */
    function containsDot($str)
    {
        $tail = substr($str, -5);

        return $tail !== false && strpos($tail, '.') !== false;
    }

    /**
     * Appends one url entry to the sitemap and counts it.
     *
     * @param string    $url        Page URL; it is XML-escaped before being written.
     * @param int       $modified   Unix timestamp for lastmod; omitted when not > 0.
     * @param int|float $priority   Value for priority; omitted when not > 0.
     * @param string    $changefreq Value for changefreq; omitted when empty.
     * @return bool Always true.
     */
    function addUrl($url, $modified = 0, $priority = 0, $changefreq = 'weekly')
    {
        $url = $this->cleanUrl($url);
        $this->numPages++;

        $this->sitemap .= " <url>\n";
        $this->sitemap .= '  <loc>' . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "</loc>\n";

        if ($modified > 0) {
            // gmdate() so the value really is UTC, matching the +00:00 suffix,
            // e.g. 2009-08-06T03:05:17+00:00
            $this->sitemap .= '  <lastmod>' . gmdate('Y-m-d\TH:i:s', $modified) . "+00:00</lastmod>\n";
        }

        if ($changefreq) {
            $this->sitemap .= '  <changefreq>' . htmlspecialchars($changefreq, ENT_QUOTES, 'UTF-8') . "</changefreq>\n";
        }

        if ($priority > 0) {
            $this->sitemap .= '  <priority>' . number_format($priority, 1, '.', '') . "</priority>\n";
        }

        $this->sitemap .= " </url>\n";

        // When writing to a file, report progress to the client as pages are found.
        if ($this->output) {
            echo "Added " . $url . "\n";
            if (ob_get_level() > 0) {
                ob_flush();
            }
            flush();
        }

        return true;
    }

    /**
     * Hook for normalising a URL before it is written to the sitemap.
     * Currently returns the URL unchanged (the source only contained
     * disabled character replacements here).
     *
     * @param string $url
     * @return string
     */
    function cleanUrl($url)
    {
        return $url;
    }

    /**
     * Performs an HTTP GET (or a header-only request) with cURL.
     *
     * @param string $url             URL to request.
     * @param int    $timeout         NOTE(review): accepted but not applied to the
     *                                request, as in the reviewed source. Wiring it to
     *                                a cURL timeout with the default of 1 would abort
     *                                every transfer that takes longer than a second,
     *                                so that is left as a deliberate decision.
     * @param string $useragent       User-Agent header value.
     * @param bool   $headers         Include the response headers in the returned contents.
     * @param bool   $followRedirects Follow Location redirects.
     * @param bool   $debug           When true, return contents plus transfer info and
     *                                treat any status other than 200 as failure.
     * @param bool   $headersOnly     Request headers only (no body).
     * @return array|string|bool With $debug: array('contents' => ..., 'info' => ...)
     *                           or false when the status is not 200. Without $debug:
     *                           whatever curl_exec() returned.
     */
    function retr($url, $timeout = 1, $useragent = 'cURL', $headers = true, $followRedirects = false, $debug = true, $headersOnly = false)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        // NOTE(review): certificate verification is switched off, so HTTPS
        // responses are not authenticated. Kept as-is to avoid breaking hosts
        // without a CA bundle; enable verification if that is not a concern.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

        if ($headers === true) {
            curl_setopt($ch, CURLOPT_HEADER, 1);
        }

        if ($headersOnly) {
            curl_setopt($ch, CURLOPT_NOBODY, 1);
        }

        if ($followRedirects == true) {
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        }

        if ($debug == true) {
            $result = array();
            $result['contents'] = curl_exec($ch);
            $result['info'] = curl_getinfo($ch);

            // A failed transfer reports http_code 0, so it is rejected here too.
            if ($result['info']['http_code'] != 200) {
                curl_close($ch);
                return false;
            }
        } else {
            $result = curl_exec($ch);
        }

        curl_close($ch);

        return $result;
    }

    /**
     * Delivers the sitemap: sends it to the client as text/xml when
     * $this->output is empty, otherwise writes it to that file and prints a
     * status message.
     */
    function output()
    {
        if (! $this->output) {
            if (! headers_sent()) {
                //header('Content-disposition: attachment; filename=sitemap.xml');
                header('Content-Type: text/xml');
                header('Content-Length: ' . strlen($this->sitemap));
            }
            echo $this->sitemap;
            return;
        }

        // Errors are checked explicitly so a failed write is not reported as success.
        $fh = @fopen($this->output, 'w');
        $written = false;
        if ($fh !== false) {
            $written = fwrite($fh, $this->sitemap) !== false;
            fclose($fh);
        }

        if ($written) {
            echo "Sitemap generated and saved to \"" . $this->output . "\".";
        } else {
            echo "Sitemap could not be saved to \"" . $this->output . "\".";
        }
    }
}
?>