diff options
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/yahoo_japan.php | 1162 |
1 files changed, 1162 insertions, 0 deletions
diff --git a/scraper/yahoo_japan.php b/scraper/yahoo_japan.php new file mode 100644 index 0000000..2406fad --- /dev/null +++ b/scraper/yahoo_japan.php @@ -0,0 +1,1162 @@ +<?php + +class yahoo_japan{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("yahoo_japan"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + return []; + } + + private function get($proxy, $url, $get = [], $return_cookies = false, $is_xhr = false, $cookie = null){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + + if($cookie !== null){ + + $c = []; + foreach($cookie as $name => $value){ + + $c[] = "{$name}=$value"; + } + + $cookie = implode("; ", $c); + } + + if($is_xhr){ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: application/json, text/plain, */*", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://search.yahoo.co.jp/", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Cookie: " . $cookie, + "Sec-Fetch-Dest: empty", + "Sec-Fetch-Mode: cors", + "Sec-Fetch-Site: same-origin", + "TE: trailers"] + ); + }else{ + + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: same-origin", + "Sec-Fetch-User: ?1", + "Priority: u=0, i", + "TE: trailers"] + ); + } + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + if($return_cookies){ + + // extract cookies + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + + } + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + + if($return_cookies){ + + return [ + "cookies" => $cookies_tmp, + "body" => $data + ]; + } + + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + [$url, $proxy] = $this->backend->get($get["npt"], "web"); + $params = []; + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $url = "https://search.yahoo.co.jp/search"; + $params = [ + "p" => $get["s"] + ]; + } + + try{ + $html = $this->get( + $proxy, + $url, + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/yahoo_japan.html"); + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $json_object = + explode( + '<script id="__NEXT_DATA__" type="application/json">', + $html + ); + + if(count($json_object) !== 2){ + + throw new Exception("Failed to find JSON script"); + } + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $json_object[1] + ), + true + ); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + //print_r($json); + + // + // Extract mainline search results + // + if(!isset($json["props"]["pageProps"]["initialProps"]["pageData"]["algos"])){ + + throw new Exception("Failed to access algos object"); + } + + foreach($json["props"]["pageProps"]["initialProps"]["pageData"]["algos"] as $result){ + + switch($result["type"]){ + + case "Algo": + if(isset($result["visualWebImageGallery"]["imageThumbs"][0]["source"])){ + + $thumb = [ + "ratio" => "1:1", + "url" => $result["visualWebImageGallery"]["imageThumbs"][0]["source"] + ]; + }elseif(isset($result["visualWebImageSnippet"])){ + + $thumb = [ + "ratio" => "1:1", + "url" => $result["visualWebImageSnippet"] + ]; + }else{ + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $sublinks = []; + + if(isset($result["megaSiteSubLinks"]["mssl"])){ + + foreach($result["megaSiteSubLinks"]["mssl"] as $sublink){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $sublink["title"] + ) + ), + "description" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $sublink["description"] + ) + ) + ), + "url" => $sublink["url"], + "date" => null + ]; + } + } + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $result["title"] + ) + ), + "description" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $result["description"] + ) + ) + ), + "url" => $result["url"], + "date" => isset($result["bylinedate"]) ? (int)$result["bylinedate"] : null, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => [] + ]; + + if(isset($result["anotherSuggest"]["exploreQueries"])){ + + foreach($result["anotherSuggest"]["exploreQueries"] as $query){ + + $out["related"][] = $query["query"]; + } + } + break; + } + } + + // + // Extract extras from "shortcuts" + // + foreach($json["props"]["pageProps"]["initialProps"]["pageData"]["shortcuts"] as $shortcut_wrap){ + + foreach($shortcut_wrap as $shortcut){ + + switch($shortcut["type"]){ + // + // Scrape videos + // + case "GoogleVideoUniversalShortcut": + foreach($shortcut["videos"] as $video){ + + if(isset($video["thumbnailUrl"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["thumbnailUrl"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + if(isset($video["publishedDate"])){ + + $date = strtotime($video["publishedDate"]); + + if($date === false){ + + $date = null; + } + }else{ + + $date = null; + } + + $out["video"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $video["title"] + ) + ), + "description" => null, + "date" => $date, + "duration" => + isset($video["duration"]) ? + $this->hms2int($video["duration"]) : null, + "views" => null, + "thumb" => $thumb, + "url" => $video["url"] + ]; + } + break; + + // + // Scrape images + // + case "ImageShortcut": + foreach($shortcut["images"] as $image_cat){ + + foreach($image_cat as $image){ + + $ratio = + $this->yahooratio( + (int)$image["originalImageWidth"], + (int)$image["originalImageHeight"] + ); + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $image["title"] + ) + ), + "source" => [ + [ + "url" => $image["originalImageUrl"], + "width" => (int)$image["originalImageWidth"], + "height" => (int)$image["originalImageHeight"] + ], + [ + "url" => $image["thumbnailUrl"], + "width" => $ratio[0], + "height" => $ratio[1] + ] + ], + "url" => $image["referrerUrl"] + ]; + } + } + break; + + case "GoogleRelatedQuestionsShortcut": + foreach($shortcut["relatedQuestions"] as $question){ + + // + // Scrape answers, present them as search results + // + if(isset($question["result"]["thumbnailsInfo"][0]["thumbnailUrl"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $question["result"]["thumbnailsInfo"][0]["thumbnailUrl"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $question["result"]["title"] + ) + ), + "description" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $question["result"]["answer"] + ) + ) + ), + "url" => $question["result"]["url"], + "date" => null, + "type" => "web", + "thumb" => $thumb, + "sublink" => [], + "table" => [] + ]; + } + break; + + case "NewsShortcut": + foreach($shortcut["results"] as $news){ + + if(isset($news["imageUrl"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $news["imageUrl"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["news"][] = [ + "title" => $news["headLine"], + "description" => + $this->fuckhtml + ->getTextContent( + $news["text"] + ), + "date" => (int)$news["publishTime"], + "thumb" => $thumb, + "url" => $news["newsLink"] + ]; + } + break; + } + } + } + + // get next page + if(isset($json["props"]["pageProps"]["initialProps"]["pageData"]["pager"]["nextPage"])){ + + $out["npt"] = + $this->backend->store( + $json["props"]["pageProps"]["initialProps"]["pageData"]["pager"]["nextPage"], + "web", + $proxy + ); + } + + $out["related"] = array_unique($out["related"]); + + return $out; + } + + public function image($get){ + + $out = [ + "status" => "ok", + "npt" => null, + "image" => [] + ]; + + if($get["npt"]){ + + // parse JSON endpoint + // https://search.yahoo.co.jp/image/api/search + // ?p=minecraft + // &ei=UTF-8 + // &n=20 + // &b=41 + // &vm=i + // &cr=AiXVLGkAPLAUueqkG0dtUP6lo_3suz4Qsrv2QjabeXt4sk1wT8irS3LLvkSPRm-u7T1wvkE1ucQvhzYuB2QtDkjswUogRjoQVx_p73BaN3P1klQUFsnIPdgAttusXE0ii0pOcYCT + // &se=0 + // &ue=0 + [$params, $proxy] = $this->backend->get($get["npt"], "images"); + + $params = json_decode($params, true); + + // increment + $params["params"]["b"] += 20; + + try{ + $json = $this->get( + $proxy, + "https://search.yahoo.co.jp/image/api/search", + $params["params"], + false, + true, + $params["cookies"] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json["Error"]["Message"])){ + + throw new Exception("API returned an error: {$json["Error"]["Message"]}"); + } + + foreach($json["algos"] as $image){ + + $out["image"][] = [ + "title" => $this->titledots($image["title"]), + "source" => [ + [ + "url" => $image["original"]["url"], + "width" => (int)$image["original"]["width"], + "height" => (int)$image["original"]["height"] + ], + [ + "url" => $image["thumbnail"]["url"], + "width" => (int)$image["thumbnail"]["width"], + "height" => (int)$image["thumbnail"]["height"] + ] + ], + "url" => $image["refererUrl"] + ]; + } + + // detect next page + if($json["algoAttribute"]["resultsIsLast"] === false){ + + $out["npt"] = + $this->backend->store( + json_encode($params), + "images", + $proxy + ); + } + + }else{ + + // parse initial page + $params = [ + "p" => $get["s"], + "ei" => "UTF-8" + ]; + + $proxy = $this->backend->get_ip(); + try{ + $html = $this->get( + $proxy, + "https://search.yahoo.co.jp/image/search", + $params, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/yahoo_japan.html"); + + $json_object = + explode( + '<script id="__NEXT_DATA__" type="application/json">', + $html["body"] + ); + + if(count($json_object) !== 2){ + + throw new Exception("Failed to find JSON script"); + } + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $json_object[1] + ), + true + ); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(!isset($json["props"]["initialProps"]["pageProps"]["algos"])){ + + throw new Exception("Failed to access algos object"); + } + + foreach($json["props"]["initialProps"]["pageProps"]["algos"] as $image){ + + $out["image"][] = [ + "title" => $this->titledots($image["title"]), + "source" => [ + [ + "url" => $image["original"]["url"], + "width" => (int)$image["original"]["width"], + "height" => (int)$image["original"]["height"] + ], + [ + "url" => $image["thumbnail"]["url"], + "width" => (int)$image["thumbnail"]["width"], + "height" => (int)$image["thumbnail"]["height"] + ] + ], + "url" => $image["refererUrl"] + ]; + } + + // get next page + if( + $json["props"]["initialProps"]["pageProps"]["algoAttribute"]["resultsIsLast"] === false && + isset($json["props"]["initialProps"]["pageProps"]["crumb"]["crumbValue"]) + ){ + + $out["npt"] = + $this->backend->store( + json_encode([ + "params" => [ + "p" => $get["s"], + "ei" => "UTF-8", + "n" => 20, // number of results + "b" => 1, // increment (+20 on every page) + "vm" => "i", + "cr" => $json["props"]["initialProps"]["pageProps"]["crumb"]["crumbValue"], + "se" => 0, + "ue" => 0 + ], + "cookies" => $html["cookies"] + ]), + "images", + $proxy + ); + } + } + + return $out; + } + + public function video($get){ + + $out = [ + "status" => "ok", + "npt" => null, + "video" => [], + "author" => [], + "livestream" => [], + "playlist" => [], + "reel" => [] + ]; + + if($get["npt"]){ + + // parse JSON endpoint + // https://search.yahoo.co.jp/image/api/search + // ?p=minecraft + // &ei=UTF-8 + // &n=20 + // &b=41 + // &vm=i + // &cr=AiXVLGkAPLAUueqkG0dtUP6lo_3suz4Qsrv2QjabeXt4sk1wT8irS3LLvkSPRm-u7T1wvkE1ucQvhzYuB2QtDkjswUogRjoQVx_p73BaN3P1klQUFsnIPdgAttusXE0ii0pOcYCT + // &se=0 + // &ue=0 + [$params, $proxy] = $this->backend->get($get["npt"], "images"); + + $params = json_decode($params, true); + + // increment + $params["params"]["b"] += 20; + + try{ + $json = $this->get( + $proxy, + "https://search.yahoo.co.jp/video/api/search", + $params["params"], + false, + true, + $params["cookies"] + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(isset($json["Error"]["Message"])){ + + throw new Exception("API returned an error: {$json["Error"]["Message"]}"); + } + + foreach($json["algos"] as $video){ + + if(isset($video["uploadDate"])){ + + $date = strtotime($video["uploadDate"]); + if($date === false){ + + $date = null; + } + + }else{ + + $date = null; + } + + if(isset($video["thumbnail"]["url"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["thumbnail"]["url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => $this->titledots($video["title"]), + "description" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $video["summary"] + ) + ) + ), + "author" => [ + "name" => + ( + isset($video["uploader"]) && + $video["uploader"] != "" + ) ? + $video["uploader"] : null, + "url" => null, + "avatar" => null + ], + "date" => $date, + "duration" => + ( + isset($video["duration"]) && + $video["duration"] != "" + ) ? + $this->hms2int($video["duration"]) : null, + "views" => null, + "thumb" => $thumb, + "url" => $video["refererUrl"] + ]; + } + + // detect next page + if($json["algoAttribute"]["isLast"] === false){ + + $out["npt"] = + $this->backend->store( + json_encode($params), + "images", + $proxy + ); + } + + }else{ + + // parse initial page + $params = [ + "p" => $get["s"], + "ei" => "UTF-8" + ]; + + $proxy = $this->backend->get_ip(); + try{ + $html = $this->get( + $proxy, + "https://search.yahoo.co.jp/video/search", + $params, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/yahoo_japan.html"); + + $json_object = + explode( + '<script id="__NEXT_DATA__" type="application/json">', + $html["body"] + ); + + if(count($json_object) !== 2){ + + throw new Exception("Failed to find JSON script"); + } + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $json_object[1] + ), + true + ); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(!isset($json["props"]["initialProps"]["pageProps"]["algos"])){ + + throw new Exception("Failed to access algos object"); + } + + foreach($json["props"]["initialProps"]["pageProps"]["algos"] as $video){ + + if(isset($video["uploadDate"])){ + + $date = strtotime($video["uploadDate"]); + if($date === false){ + + $date = null; + } + + }else{ + + $date = null; + } + + if(isset($video["thumbnail"]["url"])){ + + $thumb = [ + "ratio" => "16:9", + "url" => $video["thumbnail"]["url"] + ]; + }else{ + + $thumb = [ + "ratio" => null, + "url" => null + ]; + } + + $out["video"][] = [ + "title" => $this->titledots($video["title"]), + "description" => + $this->titledots( + html_entity_decode( + $this->fuckhtml + ->getTextContent( + $video["summary"] + ) + ) + ), + "author" => [ + "name" => + ( + isset($video["uploader"]) && + $video["uploader"] != "" + ) ? + $video["uploader"] : null, + "url" => null, + "avatar" => null + ], + "date" => $date, + "duration" => + ( + isset($video["duration"]) && + $video["duration"] != "" + ) ? + $this->hms2int($video["duration"]) : null, + "views" => null, + "thumb" => $thumb, + "url" => $video["refererUrl"] + ]; + } + + // get next page + if( + $json["props"]["initialProps"]["pageProps"]["algoAttribute"]["isLast"] === false && + isset($json["props"]["initialProps"]["pageProps"]["crumb"]["crumbValue"]) + ){ + + $out["npt"] = + $this->backend->store( + json_encode([ + "params" => [ + "n" => 20, // number of results + "b" => 1, // increment (+20) + "vm" => "i", + "cr" => $json["props"]["initialProps"]["pageProps"]["crumb"]["crumbValue"], + "p" => $get["s"], + "pd" => "", + "dr" => "", + "hq" => "", + "st" => "", + "qrw" => "", + "ei" => "UTF-8", + "ue" => "0", + "se" => "0" + ], + "cookies" => $html["cookies"] + ]), + "images", + $proxy + ); + } + } + + return $out; + } + + public function news($get){ + + if($get["npt"]){ + + [$params, $proxy] = $this->backend->get($get["npt"], "news"); + $params = json_decode($params, true); + + }else{ + + $search = $get["s"]; + if(strlen($search) === 0){ + + throw new Exception("Search term is empty!"); + } + + $proxy = $this->backend->get_ip(); + + $params = [ + "p" => $get["s"], + "ei" => "UTF-8" + ]; + } + + try{ + $html = $this->get( + $proxy, + "https://chiebukuro.yahoo.co.jp/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch search page"); + } + //$html = file_get_contents("scraper/yahoo_japan.html"); + + $out = [ + "status" => "ok", + "npt" => null, + "news" => [] + ]; + + $json_object = + explode( + 'window.PROPS = ', + $html + ); + + if(count($json_object) !== 2){ + + throw new Exception("Failed to find JSON script"); + } + + $json = + json_decode( + $this->fuckhtml + ->extract_json( + $json_object[1] + ), + true + ); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + if(!isset($json["listSearchResults"]["listContents"])){ + + throw new Exception("Yahoo! did not return a listContents object"); + } + + foreach($json["listSearchResults"]["listContents"] as $news){ + + $date = strtotime($news["datePosted"]); + + if($date === false){ + + $date = null; + } + + $thumb = [ + "ratio" => null, + "url" => null + ]; + + $out["news"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $news["heading"] + ) + ), + "author" => null, + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $news["summary"] + ) + ), + "date" => $date, + "thumb" => $thumb, + "url" => $news["url"] + ]; + } + + // get next page + if($json["pagination"]["currentPage"] != $json["pagination"]["totalNumberOfPages"]){ + + if(!isset($params["b"])){ + + $params["b"] = 1; + } + + $params["b"] += 10; + + $out["npt"] = + $this->backend->store( + json_encode($params), + "news", + $proxy + ); + } + + return $out; + } + + private function hms2int($time){ + + $parts = explode(":", $time, 3); + $time = 0; + + if(count($parts) === 3){ + + // hours + $time = $time + ((int)$parts[0] * 3600); + array_shift($parts); + } + + if(count($parts) === 2){ + + // minutes + $time = $time + ((int)$parts[0] * 60); + array_shift($parts); + } + + // seconds + $time = $time + (int)$parts[0]; + + return $time; + } + + private function titledots($title){ + + $substr = substr($title, -3); + + if( + $substr == "..." || + $substr == "…" + ){ + + return trim(substr($title, 0, -3)); + } + + return trim($title); + } + + private function yahooratio($width, $height){ + + $ratio = [ + 144 / $width, + 256 / $height + ]; + + if($ratio[0] < $ratio[1]){ + + $ratio = $ratio[0]; + }else{ + + $ratio = $ratio[1]; + } + + return [ + floor($width * $ratio), + floor($height * $ratio) + ]; + } +} |
