<?php
// Surface every notice and warning while the crawl runs.
error_reporting(E_ALL);

// Shared crawl state; parseCats() pulls these in with `global`.
$id = 0;            // last category id handed out
$result = array();  // collected categories, keyed by id
$visited = array(); // pages that have already been requested
/**
 * Downloads one page from the fixed catalogue host and returns its body.
 *
 * The request URL is echoed before the transfer. When cURL cannot be
 * initialised or the transfer fails, the reason is echoed as well so the
 * failure is not silent; the return value stays false in that case.
 *
 * @param string $url Path and query string, appended as-is to the host.
 * @return string|false Response body, or false when the transfer failed.
 */
function recive($url) {
    $target = 'http://62.153.147.220'.$url;
    echo 'Recive: '.$target.PHP_EOL;

    $cr = curl_init();
    if ($cr === false) {
        echo 'Recive failed: could not initialise cURL'.PHP_EOL;
        return false;
    }

    curl_setopt($cr, CURLOPT_URL, $target);
    // Hand the body back as a string instead of printing it.
    curl_setopt($cr, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($cr, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)');
    // Send the cookies stored in cookie.txt with the request.
    curl_setopt($cr, CURLOPT_COOKIEFILE, 'cookie.txt');

    $output = curl_exec($cr);
    if ($output === false) {
        // The error text must be read before the handle is closed.
        echo 'Recive failed: '.curl_error($cr).PHP_EOL;
    }
    curl_close($cr);

    return $output;
}
/**
 * Recursively crawls a category page and records every category link on it.
 *
 * The page is fetched through recive(), cut down to the content pane (and,
 * when present, to the last "panel_baugru_no_gru" panel), and every plain
 * <a href="...">text</a> link is then classified by its text:
 *  - "here", four-digit years and "1.Seite ..." links are ignored;
 *  - "Weitere ..." links are followed as further pages of the same parent;
 *  - anything else is stored in the global $result as a category and then
 *    crawled one level deeper.
 * A page whose content contains "Produktgruppe" is not searched for links.
 *
 * Reads and updates the globals $visited (pages already requested),
 * $result (categories keyed by id) and $id (last id handed out).
 *
 * @param string $page   Path and query string of the page to crawl.
 * @param int    $parent Id recorded as parent of categories found here.
 * @param int    $level  Depth shown in the progress output.
 * @return void
 */
function parseCats($page, $parent, $level = 1) {
    global $visited, $result, $id;

    // Never request the same page twice; this also breaks link cycles.
    if (in_array($page, $visited, true)) {
        return;
    }
    $visited[] = $page;

    $data = recive($page);
    if (!is_string($data) || $data === '') {
        // Failed or empty download: there is nothing to parse.
        return;
    }

    // Drop everything in front of the content pane when the marker exists.
    $start = strpos($data, '<div id="contentpane">');
    if ($start !== false) {
        $data = substr($data, $start);
    }

    if (strpos($data, 'Produktgruppe') !== false) {
        return;
    }

    // Only the last sub-group panel is searched for links, if there is one.
    $panel = strrpos($data, '<div class="panel_baugru_no_gru">');
    if ($panel !== false) {
        $data = substr($data, $panel);
    }

    if (!preg_match_all('|<a href="([^"]+)">([^<]+)</a>|U', $data, $links)) {
        return;
    }

    foreach ($links[2] as $key => $text) {
        if ($text === 'here') {
            continue;
        }

        // hrefs are entity-encoded in the markup; decode the ampersands so
        // the query string can be requested. (The original replaced '&'
        // with '&', a no-op that looks like a mangled form of this decode.)
        $link = str_replace('&amp;', '&', $links[1][$key]);

        // preg_match replaces ereg(), which no longer exists since PHP 7.0.
        if (preg_match('/^[0-9]{4}$/D', $text)) {
            // Year link: skip it.
            continue;
        }
        if (preg_match('/^Weitere /', $text)) {
            // Continuation page of the same category: keep parent and level.
            echo 'next page';
            parseCats($link, $parent, $level);
            continue;
        }
        if (preg_match('/^1.Seite /s', $text)) {
            // Link back to the first page: skip it.
            continue;
        }

        // Everything else is a category.
        $id++;
        $result[$id] = array('parent' => $parent, 'id' => $id, 'link' => $link, 'title' => $text);
        echo $level.' => '.$id.' '.$text.PHP_EOL;
        parseCats($link, $id, $level + 1);
    }
}
// Crawl the whole category tree starting at the catalogue root (parent id 0).
parseCats('/redesign/v10/default.aspx?10=0078139211741030006001&14=1&12=100', 0);

// Persist the collected categories; only claim success when the write worked.
if (file_put_contents('result.ser', serialize($result)) === false) {
    echo 'Could not write result.ser'.PHP_EOL;
} else {
    echo 'All done!!!';
}
?>