Iterate through all categories and get the details of every image in each category on commons.wikimedia.org using PHP

                                                               
                                                     WEB SCRAPING

<?php   
// Initialisation URL: the category page handed to scrapeData() as the starting point.
$url = 'https://commons.wikimedia.org/wiki/Category:1600_paintings_by_artist';
scrapeData($url);
/**
 * Fetch a URL with cURL and return the response body.
 *
 * @param string $url Absolute URL to request.
 * @return string|false The response body, or false when the cURL handle
 *                      cannot be created or the transfer fails.
 */
function file_get_contents_curl($url)
{
    $curl = curl_init($url);
    if ($curl === false) {
        // No handle was created, so there is nothing to configure or close.
        return false;
    }
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    // Follow HTTP redirects, as fread_url() does, so the final page is returned
    // rather than the body of the redirect response.
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($curl,CURLOPT_USERAGENT,'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
    $page = curl_exec($curl);
    // Release the handle: this function may be called once per image link, and on
    // PHP versions before 8.0 an unclosed handle stays open until script end.
    curl_close($curl);
    //use $page variable to match tags for required data of image
    return $page;
}
//
/**
 * Recursively walk a Wikimedia Commons category page.
 *
 * Downloads $url with fread_url(), extracts every anchor href on the page and
 * requests each "/wiki/File:" link through file_get_contents_curl(). Only when
 * the page yields no file links does it descend into the "/wiki/Category:"
 * links found on the same page.
 *
 * @param string $url     Absolute URL of the category page to scan.
 * @param array  $visited Set of page URLs already scanned (URL => true). It is
 *                        shared by reference across the recursion; callers can
 *                        omit it.
 * @return void
 */
function scrapeData($url, array &$visited = array())
{
    // The extracted links can point back at a page that was already scanned
    // (for example a link from a subcategory to its parent). Without this guard
    // such a cycle would recurse without end.
    if (isset($visited[$url])) {
        return;
    }
    $visited[$url] = true;

    $html = fread_url($url);
    if (!is_string($html) || $html === '') {
        // Download failed or returned nothing: there are no links to follow.
        return;
    }

    $found = preg_match_all ("/a[\s]+[^>]*?href[\s]?=[\s\"\']+".
                    "(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/",
                    $html, $matches);
    if (!$found) {
        // 0 means no anchors matched, false means the regex engine failed.
        return;
    }

    // The same href can occur several times on one page; request each only once.
    $links = array_unique($matches[1]);

    $count = 0;
    foreach ($links as $href) {
        // Accept site-relative links only: the host is prepended below, so an
        // href that already carries a scheme/host would yield a malformed URL.
        if (strpos($href, '/wiki/File:') === 0) { //find image(s) of a subcategory
            file_get_contents_curl('https://commons.wikimedia.org'.$href);//call this function to get details of image
            $count++;
        }
    }

    if ($count == 0) {
        foreach ($links as $href) {
            if (strpos($href, '/wiki/Category:') === 0) { //iterate through all subcategories
                scrapeData('https://commons.wikimedia.org'.$href, $visited);
            }
        }
    }
}
//
/**
 * Download a URL and return its body.
 *
 * Uses cURL when the extension is loaded; otherwise falls back to reading the
 * URL through fopen().
 *
 * @param string $url Absolute URL to fetch.
 * @param string $ref Value sent as the Referer header (used on the cURL path only).
 * @return string|false The body, or false when the transfer or the open fails.
 */
function fread_url($url,$ref="")
{
    if (function_exists("curl_init")) {
        $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; ".
            "Windows NT 5.0)";
        // Create exactly one handle; a second curl_init() would orphan the first.
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($ch, CURLOPT_HTTPGET, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_REFERER, $ref);
        curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
        $html = curl_exec($ch);
        curl_close($ch);
        return $html;
    }

    // Fallback without cURL: read the URL as a stream.
    $hfile = fopen($url, "r");
    if (!$hfile) {
        // Report failure the same way the cURL path does.
        return false;
    }

    $html = '';
    while (!feof($hfile)) {
        $chunk = fgets($hfile, 1024);
        if ($chunk === false) {
            // Read error or EOF reached inside fgets(): stop instead of spinning.
            break;
        }
        $html .= $chunk;
    }
    fclose($hfile);

    return $html;
}
?>

Comments