Forum Moderators: coopster
Change for($page=0;$page<2;$page++) to for($page=0;$page<53;$page++) to go over all pages.
<?php
# Base URL for the Yahoo Finance component pages of the NASDAQ Composite (^IXIC).
# The trailing "&c=" takes the zero-based page number appended below.
$url = "http://finance.yahoo.com/q/cp?s=^IXIC&c=";

# Build the list of page URLs (raise the upper bound to 53 to crawl every page).
$urls = array();
for ($page = 0; $page < 2; $page++) {
    $urls[$page] = $url . $page;
}

# Fetch all URLs in parallel with curl_multi (much faster than sequential requests).
$pages = getPages($urls);

# Walk through the returned HTML pages and mine the table cells.
$arrMinedData = array();
foreach ($pages as $html) {
    # Skip pages that failed to download (a failed transfer yields an empty body).
    if (empty($html)) {
        continue;
    }
    # Grab every data cell; the U (ungreedy) modifier keeps each match inside one <td>.
    # NOTE(review): fragile screen-scraping regex — breaks if Yahoo changes its markup.
    if (!preg_match_all('|<td class="yfnc_tabledata1"(.*)>(.*)</td>|Uim', $html, $matches)) {
        continue; # no matching table cells on this page
    }
    # $matches[2] holds the cell contents; group them five per row
    # (each table <tr> on these pages carries five <td> cells).
    foreach (array_chunk($matches[2], 5) as $row) {
        # Strip markup and surrounding whitespace from each cell.
        $cleanRow = array();
        foreach ($row as $cell) {
            $cleanRow[] = trim(strip_tags($cell));
        }
        $arrMinedData[] = $cleanRow;
    }
}

# Dump the mined data: one array entry per table row.
echo "<pre>";
print_r($arrMinedData);
echo "</pre>";
# Download all URLs in parallel via curl_multi and return the response bodies.
#
# @param array $arrUrls map of page-key => URL to fetch
# @return array         map of the same page-keys => HTML body
#                       (empty string for a failed transfer)
function getPages($arrUrls) {
    $mh = curl_multi_init();
    $c = array();

    # Register one easy handle per URL on the multi handle.
    foreach ($arrUrls as $page => $url) {
        $c[$page] = curl_init($url);
        curl_setopt($c[$page], CURLOPT_TIMEOUT, 600);
        curl_setopt($c[$page], CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($mh, $c[$page]);
    }

    # Kick off the transfers; repeat while curl asks to be called again immediately.
    $threads = null;
    do {
        $status = curl_multi_exec($mh, $threads);
    } while ($status == CURLM_CALL_MULTI_PERFORM);

    $t1 = time();
    # While transfers remain active, block in curl_multi_select until there is
    # socket activity instead of busy-spinning on curl_multi_exec (the original
    # loop pegged a CPU core for the whole duration of the downloads).
    while ($threads && $status == CURLM_OK) {
        if (curl_multi_select($mh, 1.0) == -1) {
            usleep(100000); # select failed — back off briefly to avoid a tight loop
        }
        do {
            $status = curl_multi_exec($mh, $threads);
        } while ($status == CURLM_CALL_MULTI_PERFORM);
        # Periodic keep-alive output so long-running fetches don't look hung.
        if (time() > $t1 + 2) {
            echo "keep-alive" . "<br/>";
            $t1 = time();
        }
    }

    # Collect the bodies and release every handle.
    $arrData = array();
    foreach ($arrUrls as $page => $url) {
        curl_multi_remove_handle($mh, $c[$page]);
        $arrData[$page] = curl_multi_getcontent($c[$page]);
        curl_close($c[$page]);
    }
    curl_multi_close($mh);

    return $arrData;
}
?>