aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2025-12-01 02:37:07 -0500
committerlolcat <will@lolcat.ca>2025-12-01 02:37:07 -0500
commit4f30a3ac39eba80122d8a91031934017ddb2e77b (patch)
tree0c5e2fb2aa8d12abe66a04eb7720173ca54639ac
parentbbfbe59ffb86e9c4399400c2e47bfad691826a96 (diff)
added yahoo japan, still missing the search filters though
-rw-r--r--lib/frontend.php4
-rw-r--r--scraper/yahoo_japan.php1162
-rw-r--r--settings.php16
3 files changed, 1182 insertions, 0 deletions
diff --git a/lib/frontend.php b/lib/frontend.php
index a454bab..04b08b6 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -943,6 +943,7 @@ class frontend{
"google" => "Google",
"google_api" => "Google API",
"google_cse" => "Google CSE",
+ "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage",
"qwant" => "Qwant",
"ghostery" => "Ghostery",
@@ -970,6 +971,7 @@ class frontend{
"brave" => "Brave",
"google" => "Google",
"google_cse" => "Google CSE",
+ "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage",
"qwant" => "Qwant",
"yep" => "Yep",
@@ -1001,6 +1003,7 @@ class frontend{
"brave" => "Brave",
"yandex" => "Yandex",
"google" => "Google",
+ "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage",
"qwant" => "Qwant",
"baidu" => "Baidu",
@@ -1017,6 +1020,7 @@ class frontend{
"ddg" => "DuckDuckGo",
"brave" => "Brave",
"google" => "Google",
+ "yahoo_japan" => "Yahoo! JAPAN",
"startpage" => "Startpage",
"qwant" => "Qwant",
"yep" => "Yep",
diff --git a/scraper/yahoo_japan.php b/scraper/yahoo_japan.php
new file mode 100644
index 0000000..2406fad
--- /dev/null
+++ b/scraper/yahoo_japan.php
@@ -0,0 +1,1162 @@
+<?php
+
+class yahoo_japan{
+
+    /** @var backend Helper used below for proxy selection and next-page token storage. */
+    public $backend;
+
+    /** @var fuckhtml Helper used below to extract JSON and text content from markup. */
+    public $fuckhtml;
+
+    /**
+     * Load the helper libraries and create the instances every method uses.
+     *
+     * The two properties are declared explicitly because creating
+     * undeclared (dynamic) properties is deprecated as of PHP 8.2. They
+     * are public, which is the visibility dynamic properties had.
+     */
+    public function __construct(){
+
+        // *_once: do not redeclare the classes if they were loaded already
+        include_once "lib/backend.php";
+        $this->backend = new backend("yahoo_japan");
+
+        include_once "lib/fuckhtml.php";
+        $this->fuckhtml = new fuckhtml();
+    }
+
+    /**
+     * Describe the search filters offered for a result page.
+     *
+     * No filters are implemented for this scraper, so the list is empty
+     * whatever page is asked for.
+     *
+     * @param mixed $page Page identifier (unused).
+     * @return array Always an empty array.
+     */
+    public function getfilters($page){
+
+        $filters = [];
+
+        return $filters;
+    }
+
+    /**
+     * Perform a GET request through the given proxy and return the body.
+     *
+     * @param mixed      $proxy          Proxy descriptor, passed as-is to backend->assign_proxy().
+     * @param string     $url            Target URL.
+     * @param array      $get            Query parameters appended to $url (skipped when empty).
+     * @param bool       $return_cookies When true, also collect the Set-Cookie response headers.
+     * @param bool       $is_xhr         When true, send the JSON/XHR header set instead of the
+     *                                   document-navigation header set.
+     * @param array|null $cookie         name => value pairs sent as a Cookie header
+     *                                   (only used by the XHR header set).
+     * @return string|array Response body, or ["cookies" => array, "body" => string]
+     *                      when $return_cookies is true.
+     * @throws Exception With the curl error message when the transfer fails.
+     */
+    private function get($proxy, $url, $get = [], $return_cookies = false, $is_xhr = false, $cookie = null){
+
+        if($get !== []){
+
+            $url .= "?" . http_build_query($get);
+        }
+
+        // headers shared by both request flavours, in the order they are sent
+        $headers = [
+            "User-Agent: " . config::USER_AGENT
+        ];
+
+        if($is_xhr){
+
+            $headers[] = "Accept: application/json, text/plain, */*";
+            $headers[] = "Accept-Language: en-US,en;q=0.5";
+            $headers[] = "Accept-Encoding: gzip";
+            $headers[] = "Referer: https://search.yahoo.co.jp/";
+            $headers[] = "DNT: 1";
+            $headers[] = "Sec-GPC: 1";
+            $headers[] = "Connection: keep-alive";
+
+            // only emit a Cookie header when there is something to send
+            if(
+                is_array($cookie) &&
+                $cookie !== []
+            ){
+
+                $pairs = [];
+                foreach($cookie as $name => $value){
+
+                    $pairs[] = "{$name}={$value}";
+                }
+
+                $headers[] = "Cookie: " . implode("; ", $pairs);
+            }
+
+            $headers[] = "Sec-Fetch-Dest: empty";
+            $headers[] = "Sec-Fetch-Mode: cors";
+            $headers[] = "Sec-Fetch-Site: same-origin";
+            $headers[] = "TE: trailers";
+        }else{
+
+            $headers[] = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8";
+            $headers[] = "Accept-Language: en-US,en;q=0.5";
+            $headers[] = "Accept-Encoding: gzip";
+            $headers[] = "DNT: 1";
+            $headers[] = "Sec-GPC: 1";
+            $headers[] = "Connection: keep-alive";
+            $headers[] = "Upgrade-Insecure-Requests: 1";
+            $headers[] = "Sec-Fetch-Dest: document";
+            $headers[] = "Sec-Fetch-Mode: navigate";
+            $headers[] = "Sec-Fetch-Site: same-origin";
+            $headers[] = "Sec-Fetch-User: ?1";
+            $headers[] = "Priority: u=0, i";
+            $headers[] = "TE: trailers";
+        }
+
+        $curlproc = curl_init();
+
+        curl_setopt($curlproc, CURLOPT_URL, $url);
+
+        // http2 bypass
+        curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
+        curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
+        curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
+
+        curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
+        curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
+        curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
+        curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+
+        $this->backend->assign_proxy($curlproc, $proxy);
+
+        $cookies_tmp = [];
+
+        if($return_cookies){
+
+            // extract cookies from the response headers as they arrive
+            curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
+
+                // curl requires the callback to report the bytes it consumed
+                $length = strlen($header);
+
+                $header = explode(":", $header, 2);
+
+                if(
+                    count($header) === 2 &&
+                    trim(strtolower($header[0])) == "set-cookie"
+                ){
+
+                    $cookie_tmp = explode("=", trim($header[1]), 2);
+
+                    // ignore malformed cookies that carry no "name=value" pair
+                    if(count($cookie_tmp) === 2){
+
+                        // keep the value only, drop attributes (path, expires, ...)
+                        $cookies_tmp[trim($cookie_tmp[0])] =
+                            explode(";", $cookie_tmp[1], 2)[0];
+                    }
+                }
+
+                return $length;
+            });
+        }
+
+        try{
+
+            $data = curl_exec($curlproc);
+
+            if(curl_errno($curlproc)){
+
+                throw new Exception(curl_error($curlproc));
+            }
+        }finally{
+
+            // release the handle on the error path too
+            curl_close($curlproc);
+        }
+
+        if($return_cookies){
+
+            return [
+                "cookies" => $cookies_tmp,
+                "body" => $data
+            ];
+        }
+
+        return $data;
+    }
+
+    /**
+     * Scrape a Yahoo! JAPAN web search page.
+     *
+     * Reads the __NEXT_DATA__ JSON embedded in the page: "algos" supplies
+     * the main results and related queries, "shortcuts" supplies videos,
+     * images, related-question answers (emitted as web results) and news.
+     *
+     * @param array $get Request values; "npt" (next page token) and "s"
+     *                   (search term) are read.
+     * @return array Result set with the keys status, spelling, npt, answer,
+     *               web, image, video, news and related.
+     * @throws Exception When the term is empty, the request fails or the
+     *                   expected JSON cannot be located or decoded.
+     */
+    public function web($get){
+
+        if($get["npt"]){
+
+            // continuation: the stored payload is the next page URL
+            [$url, $proxy] = $this->backend->get($get["npt"], "web");
+            $params = [];
+
+        }else{
+
+            $search = $get["s"];
+            if(strlen($search) === 0){
+
+                throw new Exception("Search term is empty!");
+            }
+
+            $proxy = $this->backend->get_ip();
+
+            $url = "https://search.yahoo.co.jp/search";
+            $params = [
+                "p" => $search
+            ];
+        }
+
+        try{
+            $html = $this->get(
+                $proxy,
+                $url,
+                $params
+            );
+        }catch(Exception $error){
+
+            throw new Exception("Failed to fetch search page");
+        }
+
+        $out = [
+            "status" => "ok",
+            "spelling" => [
+                "type" => "no_correction",
+                "using" => null,
+                "correction" => null
+            ],
+            "npt" => null,
+            "answer" => [],
+            "web" => [],
+            "image" => [],
+            "video" => [],
+            "news" => [],
+            "related" => []
+        ];
+
+        $json_object =
+            explode(
+                '<script id="__NEXT_DATA__" type="application/json">',
+                $html
+            );
+
+        if(count($json_object) !== 2){
+
+            throw new Exception("Failed to find JSON script");
+        }
+
+        $json =
+            json_decode(
+                $this->fuckhtml
+                ->extract_json(
+                    $json_object[1]
+                ),
+                true
+            );
+
+        if($json === null){
+
+            throw new Exception("Failed to decode JSON");
+        }
+
+        if(!isset($json["props"]["pageProps"]["initialProps"]["pageData"]["algos"])){
+
+            throw new Exception("Failed to access algos object");
+        }
+
+        $pagedata = $json["props"]["pageProps"]["initialProps"]["pageData"];
+
+        //
+        // Extract mainline search results
+        //
+        foreach($pagedata["algos"] as $result){
+
+            if(
+                !is_array($result) ||
+                ($result["type"] ?? null) !== "Algo"
+            ){
+
+                continue;
+            }
+
+            // prefer the gallery thumbnail, fall back to the snippet image
+            $thumb =
+                $this->web_thumb(
+                    $result["visualWebImageGallery"]["imageThumbs"][0]["source"] ??
+                    $result["visualWebImageSnippet"] ??
+                    null,
+                    "1:1"
+                );
+
+            $sublinks = [];
+
+            foreach($result["megaSiteSubLinks"]["mssl"] ?? [] as $sublink){
+
+                $sublinks[] = [
+                    "title" => $this->web_title($sublink["title"]),
+                    "description" => $this->web_description($sublink["description"]),
+                    "url" => $sublink["url"],
+                    "date" => null
+                ];
+            }
+
+            $out["web"][] = [
+                "title" => $this->web_title($result["title"]),
+                "description" => $this->web_description($result["description"]),
+                "url" => $result["url"],
+                "date" => isset($result["bylinedate"]) ? (int)$result["bylinedate"] : null,
+                "type" => "web",
+                "thumb" => $thumb,
+                "sublink" => $sublinks,
+                "table" => []
+            ];
+
+            foreach($result["anotherSuggest"]["exploreQueries"] ?? [] as $query){
+
+                $out["related"][] = $query["query"];
+            }
+        }
+
+        //
+        // Extract extras from "shortcuts" (the key is not guaranteed to exist)
+        //
+        foreach($pagedata["shortcuts"] ?? [] as $shortcut_wrap){
+
+            if(!is_array($shortcut_wrap)){
+
+                continue;
+            }
+
+            foreach($shortcut_wrap as $shortcut){
+
+                if(!is_array($shortcut)){
+
+                    continue;
+                }
+
+                switch($shortcut["type"] ?? null){
+                    //
+                    // Scrape videos
+                    //
+                    case "GoogleVideoUniversalShortcut":
+                        foreach($shortcut["videos"] ?? [] as $video){
+
+                            $date =
+                                isset($video["publishedDate"]) ?
+                                strtotime($video["publishedDate"]) : false;
+
+                            $out["video"][] = [
+                                "title" => $this->web_title($video["title"]),
+                                "description" => null,
+                                "date" => $date === false ? null : $date,
+                                "duration" =>
+                                    isset($video["duration"]) ?
+                                    $this->hms2int($video["duration"]) : null,
+                                "views" => null,
+                                "thumb" =>
+                                    $this->web_thumb(
+                                        $video["thumbnailUrl"] ?? null,
+                                        "16:9"
+                                    ),
+                                "url" => $video["url"]
+                            ];
+                        }
+                        break;
+
+                    //
+                    // Scrape images
+                    //
+                    case "ImageShortcut":
+                        foreach($shortcut["images"] ?? [] as $image_cat){
+
+                            foreach($image_cat as $image){
+
+                                $width = (int)$image["originalImageWidth"];
+                                $height = (int)$image["originalImageHeight"];
+
+                                // thumbnail size is derived from the original size
+                                $ratio = $this->yahooratio($width, $height);
+
+                                $out["image"][] = [
+                                    "title" => $this->web_title($image["title"]),
+                                    "source" => [
+                                        [
+                                            "url" => $image["originalImageUrl"],
+                                            "width" => $width,
+                                            "height" => $height
+                                        ],
+                                        [
+                                            "url" => $image["thumbnailUrl"],
+                                            "width" => $ratio[0],
+                                            "height" => $ratio[1]
+                                        ]
+                                    ],
+                                    "url" => $image["referrerUrl"]
+                                ];
+                            }
+                        }
+                        break;
+
+                    //
+                    // Scrape answers, present them as search results
+                    //
+                    case "GoogleRelatedQuestionsShortcut":
+                        foreach($shortcut["relatedQuestions"] ?? [] as $question){
+
+                            $out["web"][] = [
+                                "title" => $this->web_title($question["result"]["title"]),
+                                "description" => $this->web_description($question["result"]["answer"]),
+                                "url" => $question["result"]["url"],
+                                "date" => null,
+                                "type" => "web",
+                                "thumb" =>
+                                    $this->web_thumb(
+                                        $question["result"]["thumbnailsInfo"][0]["thumbnailUrl"] ?? null,
+                                        "16:9"
+                                    ),
+                                "sublink" => [],
+                                "table" => []
+                            ];
+                        }
+                        break;
+
+                    //
+                    // Scrape news
+                    //
+                    case "NewsShortcut":
+                        foreach($shortcut["results"] ?? [] as $news){
+
+                            $out["news"][] = [
+                                "title" => $news["headLine"],
+                                "description" =>
+                                    $this->fuckhtml
+                                    ->getTextContent(
+                                        $news["text"]
+                                    ),
+                                "date" => (int)$news["publishTime"],
+                                "thumb" =>
+                                    $this->web_thumb(
+                                        $news["imageUrl"] ?? null,
+                                        "16:9"
+                                    ),
+                                "url" => $news["newsLink"]
+                            ];
+                        }
+                        break;
+                }
+            }
+        }
+
+        // get next page
+        if(isset($pagedata["pager"]["nextPage"])){
+
+            $out["npt"] =
+                $this->backend->store(
+                    $pagedata["pager"]["nextPage"],
+                    "web",
+                    $proxy
+                );
+        }
+
+        // array_unique() keeps the original keys; reindex so the list
+        // stays a list (and serialises as a JSON array) after deduplication
+        $out["related"] = array_values(array_unique($out["related"]));
+
+        return $out;
+    }
+
+    /**
+     * Build a thumbnail descriptor; both fields are null when there is no URL.
+     */
+    private function web_thumb($url, $ratio){
+
+        if($url === null){
+
+            return [
+                "ratio" => null,
+                "url" => null
+            ];
+        }
+
+        return [
+            "ratio" => $ratio,
+            "url" => $url
+        ];
+    }
+
+    /**
+     * Reduce a markup fragment to its text and strip a trailing ellipsis.
+     */
+    private function web_title($html){
+
+        return
+            $this->titledots(
+                $this->fuckhtml
+                ->getTextContent(
+                    $html
+                )
+            );
+    }
+
+    /**
+     * Same as web_title(), with HTML entities decoded as well.
+     */
+    private function web_description($html){
+
+        return
+            $this->titledots(
+                html_entity_decode(
+                    $this->fuckhtml
+                    ->getTextContent(
+                        $html
+                    )
+                )
+            );
+    }
+
+    /**
+     * Scrape Yahoo! JAPAN image search.
+     *
+     * The first page is read from the __NEXT_DATA__ JSON embedded in the
+     * HTML page; following pages come from the /image/api/search JSON
+     * endpoint, called with the crumb and cookies captured on the first
+     * page and stored inside the next page token.
+     *
+     * @param array $get Request values; "npt" (next page token) and "s"
+     *                   (search term) are read.
+     * @return array Result set with the keys status, npt and image.
+     * @throws Exception When the term is empty, the request fails, the
+     *                   response cannot be decoded or the API reports an error.
+     */
+    public function image($get){
+
+        $out = [
+            "status" => "ok",
+            "npt" => null,
+            "image" => []
+        ];
+
+        if($get["npt"]){
+
+            // parse JSON endpoint
+            // https://search.yahoo.co.jp/image/api/search
+            // ?p=minecraft
+            // &ei=UTF-8
+            // &n=20
+            // &b=41
+            // &vm=i
+            // &cr=<crumb value taken from the first page>
+            // &se=0
+            // &ue=0
+            [$params, $proxy] = $this->backend->get($get["npt"], "images");
+
+            $params = json_decode($params, true);
+
+            if(!isset($params["params"]["b"])){
+
+                throw new Exception("Failed to decode next page token");
+            }
+
+            // increment the result offset by one page
+            $params["params"]["b"] += 20;
+
+            try{
+                $json = $this->get(
+                    $proxy,
+                    "https://search.yahoo.co.jp/image/api/search",
+                    $params["params"],
+                    false,
+                    true,
+                    $params["cookies"] ?? null
+                );
+            }catch(Exception $error){
+
+                throw new Exception("Failed to fetch JSON");
+            }
+
+            $json = json_decode($json, true);
+
+            if($json === null){
+
+                throw new Exception("Failed to decode JSON");
+            }
+
+            if(isset($json["Error"]["Message"])){
+
+                throw new Exception("API returned an error: {$json["Error"]["Message"]}");
+            }
+
+            if(!isset($json["algos"])){
+
+                throw new Exception("Failed to access algos object");
+            }
+
+            foreach($json["algos"] as $image){
+
+                $out["image"][] = $this->image_entry($image);
+            }
+
+            // detect next page
+            if(($json["algoAttribute"]["resultsIsLast"] ?? null) === false){
+
+                $out["npt"] =
+                    $this->backend->store(
+                        json_encode($params),
+                        "images",
+                        $proxy
+                    );
+            }
+
+            return $out;
+        }
+
+        // parse initial page
+        $search = $get["s"];
+        if(strlen($search) === 0){
+
+            throw new Exception("Search term is empty!");
+        }
+
+        $params = [
+            "p" => $search,
+            "ei" => "UTF-8"
+        ];
+
+        $proxy = $this->backend->get_ip();
+        try{
+            // cookies are captured because the JSON endpoint needs them later
+            $html = $this->get(
+                $proxy,
+                "https://search.yahoo.co.jp/image/search",
+                $params,
+                true
+            );
+        }catch(Exception $error){
+
+            throw new Exception("Failed to fetch search page");
+        }
+
+        $json_object =
+            explode(
+                '<script id="__NEXT_DATA__" type="application/json">',
+                $html["body"]
+            );
+
+        if(count($json_object) !== 2){
+
+            throw new Exception("Failed to find JSON script");
+        }
+
+        $json =
+            json_decode(
+                $this->fuckhtml
+                ->extract_json(
+                    $json_object[1]
+                ),
+                true
+            );
+
+        if($json === null){
+
+            throw new Exception("Failed to decode JSON");
+        }
+
+        if(!isset($json["props"]["initialProps"]["pageProps"]["algos"])){
+
+            throw new Exception("Failed to access algos object");
+        }
+
+        $pageprops = $json["props"]["initialProps"]["pageProps"];
+
+        foreach($pageprops["algos"] as $image){
+
+            $out["image"][] = $this->image_entry($image);
+        }
+
+        // get next page
+        if(
+            ($pageprops["algoAttribute"]["resultsIsLast"] ?? null) === false &&
+            isset($pageprops["crumb"]["crumbValue"])
+        ){
+
+            $out["npt"] =
+                $this->backend->store(
+                    json_encode([
+                        "params" => [
+                            "p" => $search,
+                            "ei" => "UTF-8",
+                            "n" => 20, // number of results
+                            "b" => 1, // increment (+20 on every page)
+                            "vm" => "i",
+                            "cr" => $pageprops["crumb"]["crumbValue"],
+                            "se" => 0,
+                            "ue" => 0
+                        ],
+                        "cookies" => $html["cookies"]
+                    ]),
+                    "images",
+                    $proxy
+                );
+        }
+
+        return $out;
+    }
+
+    /**
+     * Convert one entry of an image "algos" list to the output format.
+     */
+    private function image_entry($image){
+
+        return [
+            "title" => $this->titledots($image["title"]),
+            "source" => [
+                [
+                    "url" => $image["original"]["url"],
+                    "width" => (int)$image["original"]["width"],
+                    "height" => (int)$image["original"]["height"]
+                ],
+                [
+                    "url" => $image["thumbnail"]["url"],
+                    "width" => (int)$image["thumbnail"]["width"],
+                    "height" => (int)$image["thumbnail"]["height"]
+                ]
+            ],
+            "url" => $image["refererUrl"]
+        ];
+    }
+
+    /**
+     * Scrape Yahoo! JAPAN video search.
+     *
+     * The first page is read from the __NEXT_DATA__ JSON embedded in the
+     * HTML page; following pages come from the /video/api/search JSON
+     * endpoint, called with the crumb and cookies captured on the first
+     * page and stored inside the next page token.
+     *
+     * @param array $get Request values; "npt" (next page token) and "s"
+     *                   (search term) are read.
+     * @return array Result set with the keys status, npt, video, author,
+     *               livestream, playlist and reel (only video is filled).
+     * @throws Exception When the term is empty, the request fails, the
+     *                   response cannot be decoded or the API reports an error.
+     */
+    public function video($get){
+
+        $out = [
+            "status" => "ok",
+            "npt" => null,
+            "video" => [],
+            "author" => [],
+            "livestream" => [],
+            "playlist" => [],
+            "reel" => []
+        ];
+
+        // NOTE(review): tokens are stored and fetched under the "images"
+        // page name, same as image(). It works because both calls agree,
+        // but it looks like a copy-paste leftover -- confirm against
+        // lib/backend.php before renaming.
+        if($get["npt"]){
+
+            // parse JSON endpoint
+            // https://search.yahoo.co.jp/video/api/search
+            // called with the parameters stored on the first page
+            // (n, b, vm, cr, p, pd, dr, hq, st, qrw, ei, ue, se)
+            [$params, $proxy] = $this->backend->get($get["npt"], "images");
+
+            $params = json_decode($params, true);
+
+            if(!isset($params["params"]["b"])){
+
+                throw new Exception("Failed to decode next page token");
+            }
+
+            // increment the result offset by one page
+            $params["params"]["b"] += 20;
+
+            try{
+                $json = $this->get(
+                    $proxy,
+                    "https://search.yahoo.co.jp/video/api/search",
+                    $params["params"],
+                    false,
+                    true,
+                    $params["cookies"] ?? null
+                );
+            }catch(Exception $error){
+
+                throw new Exception("Failed to fetch JSON");
+            }
+
+            $json = json_decode($json, true);
+
+            if($json === null){
+
+                throw new Exception("Failed to decode JSON");
+            }
+
+            if(isset($json["Error"]["Message"])){
+
+                throw new Exception("API returned an error: {$json["Error"]["Message"]}");
+            }
+
+            if(!isset($json["algos"])){
+
+                throw new Exception("Failed to access algos object");
+            }
+
+            foreach($json["algos"] as $video){
+
+                $out["video"][] = $this->video_entry($video);
+            }
+
+            // detect next page
+            if(($json["algoAttribute"]["isLast"] ?? null) === false){
+
+                $out["npt"] =
+                    $this->backend->store(
+                        json_encode($params),
+                        "images",
+                        $proxy
+                    );
+            }
+
+            return $out;
+        }
+
+        // parse initial page
+        $search = $get["s"];
+        if(strlen($search) === 0){
+
+            throw new Exception("Search term is empty!");
+        }
+
+        $params = [
+            "p" => $search,
+            "ei" => "UTF-8"
+        ];
+
+        $proxy = $this->backend->get_ip();
+        try{
+            // cookies are captured because the JSON endpoint needs them later
+            $html = $this->get(
+                $proxy,
+                "https://search.yahoo.co.jp/video/search",
+                $params,
+                true
+            );
+        }catch(Exception $error){
+
+            throw new Exception("Failed to fetch search page");
+        }
+
+        $json_object =
+            explode(
+                '<script id="__NEXT_DATA__" type="application/json">',
+                $html["body"]
+            );
+
+        if(count($json_object) !== 2){
+
+            throw new Exception("Failed to find JSON script");
+        }
+
+        $json =
+            json_decode(
+                $this->fuckhtml
+                ->extract_json(
+                    $json_object[1]
+                ),
+                true
+            );
+
+        if($json === null){
+
+            throw new Exception("Failed to decode JSON");
+        }
+
+        if(!isset($json["props"]["initialProps"]["pageProps"]["algos"])){
+
+            throw new Exception("Failed to access algos object");
+        }
+
+        $pageprops = $json["props"]["initialProps"]["pageProps"];
+
+        foreach($pageprops["algos"] as $video){
+
+            $out["video"][] = $this->video_entry($video);
+        }
+
+        // get next page
+        if(
+            ($pageprops["algoAttribute"]["isLast"] ?? null) === false &&
+            isset($pageprops["crumb"]["crumbValue"])
+        ){
+
+            $out["npt"] =
+                $this->backend->store(
+                    json_encode([
+                        "params" => [
+                            "n" => 20, // number of results
+                            "b" => 1, // increment (+20)
+                            "vm" => "i",
+                            "cr" => $pageprops["crumb"]["crumbValue"],
+                            "p" => $search,
+                            "pd" => "",
+                            "dr" => "",
+                            "hq" => "",
+                            "st" => "",
+                            "qrw" => "",
+                            "ei" => "UTF-8",
+                            "ue" => "0",
+                            "se" => "0"
+                        ],
+                        "cookies" => $html["cookies"]
+                    ]),
+                    "images",
+                    $proxy
+                );
+        }
+
+        return $out;
+    }
+
+    /**
+     * Convert one entry of a video "algos" list to the output format.
+     */
+    private function video_entry($video){
+
+        // strtotime() returns false on unparsable input; report that as null
+        $date =
+            isset($video["uploadDate"]) ?
+            strtotime($video["uploadDate"]) : false;
+
+        if(isset($video["thumbnail"]["url"])){
+
+            $thumb = [
+                "ratio" => "16:9",
+                "url" => $video["thumbnail"]["url"]
+            ];
+        }else{
+
+            $thumb = [
+                "ratio" => null,
+                "url" => null
+            ];
+        }
+
+        return [
+            "title" => $this->titledots($video["title"]),
+            "description" =>
+                $this->titledots(
+                    html_entity_decode(
+                        $this->fuckhtml
+                        ->getTextContent(
+                            $video["summary"]
+                        )
+                    )
+                ),
+            "author" => [
+                "name" =>
+                    (
+                        isset($video["uploader"]) &&
+                        $video["uploader"] != ""
+                    ) ?
+                    $video["uploader"] : null,
+                "url" => null,
+                "avatar" => null
+            ],
+            "date" => $date === false ? null : $date,
+            "duration" =>
+                (
+                    isset($video["duration"]) &&
+                    $video["duration"] != ""
+                ) ?
+                $this->hms2int($video["duration"]) : null,
+            "views" => null,
+            "thumb" => $thumb,
+            "url" => $video["refererUrl"]
+        ];
+    }
+
+    /**
+     * Scrape the "news" tab, served from chiebukuro.yahoo.co.jp search.
+     *
+     * Results are read from the "window.PROPS = " JSON embedded in the
+     * page. The next page token stores the request parameters with the
+     * "b" offset advanced by 10.
+     *
+     * @param array $get Request values; "npt" (next page token) and "s"
+     *                   (search term) are read.
+     * @return array Result set with the keys status, npt and news.
+     * @throws Exception When the term is empty, the request fails or the
+     *                   expected JSON cannot be located or decoded.
+     */
+    public function news($get){
+
+        if($get["npt"]){
+
+            [$params, $proxy] = $this->backend->get($get["npt"], "news");
+            $params = json_decode($params, true);
+
+            if(!is_array($params)){
+
+                throw new Exception("Failed to decode next page token");
+            }
+
+        }else{
+
+            $search = $get["s"];
+            if(strlen($search) === 0){
+
+                throw new Exception("Search term is empty!");
+            }
+
+            $proxy = $this->backend->get_ip();
+
+            $params = [
+                "p" => $search,
+                "ei" => "UTF-8"
+            ];
+        }
+
+        try{
+            $html = $this->get(
+                $proxy,
+                "https://chiebukuro.yahoo.co.jp/search",
+                $params
+            );
+        }catch(Exception $error){
+
+            throw new Exception("Failed to fetch search page");
+        }
+
+        $out = [
+            "status" => "ok",
+            "npt" => null,
+            "news" => []
+        ];
+
+        $json_object =
+            explode(
+                'window.PROPS = ',
+                $html
+            );
+
+        if(count($json_object) !== 2){
+
+            throw new Exception("Failed to find JSON script");
+        }
+
+        $json =
+            json_decode(
+                $this->fuckhtml
+                ->extract_json(
+                    $json_object[1]
+                ),
+                true
+            );
+
+        if($json === null){
+
+            throw new Exception("Failed to decode JSON");
+        }
+
+        if(!isset($json["listSearchResults"]["listContents"])){
+
+            throw new Exception("Yahoo! did not return a listContents object");
+        }
+
+        foreach($json["listSearchResults"]["listContents"] as $news){
+
+            // a missing or unparsable date is reported as null
+            $date =
+                isset($news["datePosted"]) ?
+                strtotime($news["datePosted"]) : false;
+
+            $out["news"][] = [
+                "title" =>
+                    $this->titledots(
+                        $this->fuckhtml
+                        ->getTextContent(
+                            $news["heading"]
+                        )
+                    ),
+                "author" => null,
+                "description" =>
+                    $this->titledots(
+                        $this->fuckhtml
+                        ->getTextContent(
+                            $news["summary"]
+                        )
+                    ),
+                "date" => $date === false ? null : $date,
+                "thumb" => [
+                    "ratio" => null,
+                    "url" => null
+                ],
+                "url" => $news["url"]
+            ];
+        }
+
+        // get next page, only when the pager is present and not on its last page
+        if(
+            isset(
+                $json["pagination"]["currentPage"],
+                $json["pagination"]["totalNumberOfPages"]
+            ) &&
+            $json["pagination"]["currentPage"] != $json["pagination"]["totalNumberOfPages"]
+        ){
+
+            // the first page is requested without "b"; offsets start at 1
+            if(!isset($params["b"])){
+
+                $params["b"] = 1;
+            }
+
+            $params["b"] += 10;
+
+            $out["npt"] =
+                $this->backend->store(
+                    json_encode($params),
+                    "news",
+                    $proxy
+                );
+        }
+
+        return $out;
+    }
+
+    /**
+     * Convert a "H:M:S", "M:S" or "S" duration string to seconds.
+     *
+     * @param string $time Colon separated duration.
+     * @return int Duration in seconds.
+     */
+    private function hms2int($time){
+
+        $seconds = 0;
+
+        // at most three fields; every field shifts the running total up
+        // by one base-60 position before being added
+        foreach(explode(":", $time, 3) as $field){
+
+            $seconds = ($seconds * 60) + (int)$field;
+        }
+
+        return $seconds;
+    }
+
+    /**
+     * Trim a string and drop a trailing ellipsis ("..." or "…").
+     *
+     * The check is byte based: both markers are three bytes long.
+     *
+     * @param string $title Text to clean up.
+     * @return string Trimmed text without the trailing ellipsis.
+     */
+    private function titledots($title){
+
+        $tail = substr($title, -3);
+        $has_ellipsis = in_array($tail, ["...", "…"], true);
+
+        return trim(
+            $has_ellipsis ?
+            substr($title, 0, -3) : $title
+        );
+    }
+
+    /**
+     * Scale an image size by the smaller of 144/width and 256/height,
+     * rounding down.
+     *
+     * Integer arithmetic is used so that the result is exact (no float
+     * rounding before the floor) and so that the returned values are ints.
+     *
+     * @param int $width  Original width in pixels.
+     * @param int $height Original height in pixels.
+     * @return int[] [width, height] of the scaled image, or [0, 0] when
+     *               either dimension is not positive (size unknown).
+     */
+    private function yahooratio($width, $height){
+
+        $width = (int)$width;
+        $height = (int)$height;
+
+        // a missing dimension used to trigger a division by zero
+        if(
+            $width <= 0 ||
+            $height <= 0
+        ){
+
+            return [0, 0];
+        }
+
+        // 144 / width < 256 / height, cross-multiplied to stay in integers
+        if(144 * $height < 256 * $width){
+
+            // scale factor is 144 / width
+            return [
+                144,
+                intdiv(144 * $height, $width)
+            ];
+        }
+
+        // scale factor is 256 / height
+        return [
+            intdiv(256 * $width, $height),
+            256
+        ];
+    }
+}
diff --git a/settings.php b/settings.php
index 79cf947..afb1a36 100644
--- a/settings.php
+++ b/settings.php
@@ -142,6 +142,10 @@ $settings = [
"text" => "Google CSE"
],
[
+ "value" => "yahoo_japan",
+ "text" => "Yahoo! JAPAN",
+ ],
+ [
"value" => "startpage",
"text" => "Startpage"
],
@@ -224,6 +228,10 @@ $settings = [
"text" => "Google CSE"
],
[
+ "value" => "yahoo_japan",
+ "text" => "Yahoo! JAPAN",
+ ],
+ [
"value" => "startpage",
"text" => "Startpage"
],
@@ -306,6 +314,10 @@ $settings = [
"text" => "Google"
],
[
+ "value" => "yahoo_japan",
+ "text" => "Yahoo! JAPAN",
+ ],
+ [
"value" => "startpage",
"text" => "Startpage"
],
@@ -344,6 +356,10 @@ $settings = [
"text" => "Google"
],
[
+ "value" => "yahoo_japan",
+ "text" => "Yahoo! JAPAN",
+ ],
+ [
"value" => "startpage",
"text" => "Startpage"
],
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage