Iterate through all categories and get details of all images of each category in COMMONS.WIKIMEDIA.ORG using PHP
WEB SCRAPING
<?php
// Commons category page from which the crawl starts.
$url = 'https://commons.wikimedia.org/wiki/Category:1600_paintings_by_artist';
// Start the crawl. scrapeData() is declared further down in this file; PHP
// resolves top-level function declarations before running the script, so the
// call may precede the definition.
scrapeData($url);
/**
 * Download a single page with cURL and hand back the raw response body.
 *
 * The request is sent with a fixed desktop-browser User-Agent string and the
 * body is returned to the caller instead of being echoed.
 *
 * @param string $url Absolute URL of the page to download.
 * @return string|bool The response body, or false when the transfer fails.
 */
function file_get_contents_curl($url)
{
    $handle = curl_init($url);

    curl_setopt_array($handle, array(
        CURLOPT_RETURNTRANSFER => TRUE,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13',
    ));

    // The returned markup is what a caller would match tags against to
    // extract the required data of the image.
    return curl_exec($handle);
}
//
/**
 * Crawl a Wikimedia Commons category page.
 *
 * The page is downloaded with fread_url() and every anchor href is extracted.
 * Each link to a "/wiki/File:" page is fetched with file_get_contents_curl().
 * Only when the page links to no file pages at all are its "/wiki/Category:"
 * links followed recursively.
 *
 * @param string $url     Absolute URL of the category page to crawl.
 * @param array  $visited Optional set (URL => true) of pages already crawled.
 *                        Callers normally omit it; it is threaded through the
 *                        recursion so that no page is processed twice.
 * @return void
 */
function scrapeData($url, array &$visited = array())
{
    // Without this guard a category that is reachable through more than one
    // link path, or that links back to a page already being crawled, would be
    // processed again and again and the recursion would never terminate.
    if (isset($visited[$url])) {
        return;
    }
    $visited[$url] = true;

    $html = fread_url($url);
    if (!is_string($html) || $html === '') {
        // The download failed or returned nothing, so there is nothing to parse.
        return;
    }

    $found = preg_match_all(
        "/a[\s]+[^>]*?href[\s]?=[\s\"\']+" .
        "(.*?)[\"\']+.*?>" . "([^<]+|.*?)?<\/a>/",
        $html,
        $matches
    );
    if ($found === false || empty($matches[1])) {
        return;
    }

    // The same target can be linked several times on one page; keep each
    // href once so it is not downloaded repeatedly.
    $links = array_unique($matches[1]);

    // Only root-relative hrefs are accepted: the Commons host is prepended
    // below, so an absolute or protocol-relative href would yield a malformed
    // URL (and could point at a different site altogether).
    $count = 0;
    foreach ($links as $href) {
        if (strpos($href, '/wiki/File:') === 0) {
            // Fetch the file description page that carries the image details.
            file_get_contents_curl('https://commons.wikimedia.org' . $href);
            $count++;
        }
    }

    // NOTE(review): sub-categories are only followed when the page lists no
    // files itself. This mirrors the original behaviour; confirm it is intended.
    if ($count == 0) {
        foreach ($links as $href) {
            if (strpos($href, '/wiki/Category:') === 0) {
                scrapeData('https://commons.wikimedia.org' . $href, $visited);
            }
        }
    }
}
//
/**
 * Download a URL and return its body as a string.
 *
 * cURL is used when the extension is available (redirects are followed, the
 * given referer is sent and cookies are written to 'cookie.txt'). Otherwise
 * the URL is read line by line through fopen().
 *
 * @param string $url Absolute URL to download.
 * @param string $ref Value for the Referer header (used by the cURL path only).
 * @return string|bool The body, or false when the resource could not be
 *                     opened or the transfer failed.
 */
function fread_url($url,$ref="")
{
    if (function_exists("curl_init")) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; ".
            "Windows NT 5.0)";
        curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_REFERER, $ref);
        curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');

        $html = curl_exec($ch);
        curl_close($ch);

        return $html;
    }

    // Fallback for installations without the cURL extension.
    $hfile = fopen($url, "r");
    if (!$hfile) {
        // Report failure the same way the cURL branch does.
        return false;
    }

    $html = '';
    while (!feof($hfile)) {
        $chunk = fgets($hfile, 1024);
        if ($chunk === false) {
            // End of data or a read error: stop instead of spinning on a
            // stream that never reports EOF.
            break;
        }
        $html .= $chunk;
    }
    fclose($hfile);

    return $html;
}
?>
Comments
Post a Comment