about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca> 2026-05-09 22:24:36 -0400
committer: lolcat <will@lolcat.ca> 2026-05-09 22:24:36 -0400
commit: c45f8b1e128ebb834c579f7b03f2f77811bf4ca5 (patch)
tree: 88df997003d16a202684abcb6fffff7c9eb195c2
parent: 6086c63148f7f859da29280377861166a7eab387 (diff)
fix startpage pagination and fuckups with word corrections
-rw-r--r-- scraper/startpage.php 338
1 file changed, 17 insertions, 321 deletions
diff --git a/scraper/startpage.php b/scraper/startpage.php
index e48a429..a555dc5 100644
--- a/scraper/startpage.php
+++ b/scraper/startpage.php
@@ -564,12 +564,16 @@ class startpage{
break;
case "spellsuggest-google":
- $out["spelling"] =
- [
- "type" => "including",
- "using" => $json["render"]["query"],
- "correction" => $category["results"][0]["query"]
- ];
+
+ if(isset($category["results"][0]["query"])){
+
+ $out["spelling"] =
+ [
+ "type" => "including",
+ "using" => $json["render"]["query"],
+ "correction" => urldecode($category["results"][0]["query"])
+ ];
+ }
break;
case "dictionary-qi":
@@ -645,318 +649,6 @@ class startpage{
}
}
- // parse instant answers
- if(
- $get["extendedsearch"] == "yes" &&
- $get_instant_answer === true
- ){
-
- // https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=BqZ3inqrAgF701&sr=1
- try{
- $post = [
- "se" => "n0vze2y9dqwy",
- "q" => $json["render"]["query"],
- "results" => [], // populate
- "enableKnowledgePanel" => true,
- "enableMediaThumbBar" => false,
- "enableSearchSuggestions" => false,
- "enableTripadvisorProperties" => [],
- "enableTripadvisorPlaces" => [],
- "enableTripadvisorPlacesForLocations" => [],
- "enableWebProducts" => false,
- "tripadvisorPartnerId" => null,
- "tripadvisorMapColorMode" => "light",
- "tripadvisorDisablesKnowledgePanel" => false,
- "instantAnswers" => [
- "smartAnswers",
- "youtube",
- "tripadvisor"
- ],
- "iaType" => null,
- "forceEnhancedKnowledgePanel" => false,
- "shoppingOnly" => false,
- "allowAdultProducts" => true,
- "lang" => "en",
- "browserLang" => "en-US",
- "browserTimezone" => "America/New_York",
- "market" => null,
- "userLocation" => null,
- "userDate" => date("Y-m-d"),
- "userAgentType" => "unknown"
- ];
-
- foreach($out["web"] as $result){
-
- $post["results"][] = [
- "url" => $result["url"],
- "title" => $result["title"]
- ];
- }
-
- $post = json_encode($post, JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES | JSON_INVALID_UTF8_IGNORE);
-
- $additional_data =
- $this->get(
- $proxy,
- "https://www.startpage.com/sp/qi?qimsn=ex&sxap=%2Fv1%2Fquery&sc=" . $json["render"]["callback_sc"] . "&sr=1",
- $post,
- true,
- true
- );
-
- $additional_data = json_decode($additional_data, true);
-
- if($additional_data === null){
-
- throw new Exception("Failed to decode JSON"); // just break out, dont fail completely
- }
-
- if(!isset($additional_data["knowledgePanel"])){
-
- throw new Exception("Response has missing data (knowledgePanel)");
- }
-
- $additional_data = $additional_data["knowledgePanel"];
-
- $answer = [
- "title" => $additional_data["meta"]["title"],
- "description" => [
- [
- "type" => "quote",
- "value" => $additional_data["meta"]["description"]
- ]
- ],
- "url" => $additional_data["meta"]["origWikiUrl"],
- "thumb" => $additional_data["meta"]["image"],
- "table" => [],
- "sublink" => []
- ];
-
- // parse html for instant answer
- $this->fuckhtml->load($additional_data["html"]);
-
- $div =
- $this->fuckhtml
- ->getElementsByTagName(
- "div"
- );
-
- // get description
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- "sx-kp-short-extract sx-kp-short-extract-complete",
- $div
- );
-
- if(count($description) !== 0){
-
- $answer["description"][] = [
- "type" => "text",
- "value" =>
- html_entity_decode(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- )
- ];
- }
-
- // get socials
- $socials =
- $this->fuckhtml
- ->getElementsByClassName(
- "sx-wiki-social-link",
- "a"
- );
-
- foreach($socials as $social){
-
- $title =
- $this->fuckhtml
- ->getTextContent(
- $social["attributes"]["title"]
- );
-
- $url =
- $this->fuckhtml
- ->getTextContent(
- $social["attributes"]["href"]
- );
-
- switch($title){
-
- case "Official Website":
- $title = "Website";
- break;
- }
-
- $answer["sublink"][$title] = $url;
- }
-
- // get videos
- $videos =
- $this->fuckhtml
- ->getElementsByClassName(
- "sx-kp-video-grid-item",
- $div
- );
-
- foreach($videos as $video){
-
- $this->fuckhtml->load($video);
-
- $as =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- );
-
- if(count($as) === 0){
-
- // ?? invalid
- continue;
- }
-
- $image =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-sx-src",
- "img"
- );
-
- if(count($image) !== 0){
-
- $thumb = [
- "ratio" => "16:9",
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $image[0]["attributes"]["data-sx-src"]
- )
- ];
- }else{
-
- $thumb = [
- "ratio" => null,
- "url" => null
- ];
- }
-
- $out["video"][] = [
- "title" =>
- $this->fuckhtml
- ->getTextContent(
- $as[0]["attributes"]["title"]
- ),
- "description" => null,
- "date" => null,
- "duration" => null,
- "views" => null,
- "thumb" => $thumb,
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $as[0]["attributes"]["href"]
- )
- ];
- }
-
- // reset
- $this->fuckhtml->load($additional_data["html"]);
-
- // get table elements
- $table =
- $this->fuckhtml
- ->getElementsByClassName(
- "sx-infobox",
- "table"
- );
-
- if(count($table) !== 0){
-
- $trs =
- $this->fuckhtml
- ->getElementsByTagName(
- "tr"
- );
-
- foreach($trs as $tr){
-
- $this->fuckhtml->load($tr);
-
- // ok so startpage devs cant fucking code a table
- // td = content
- // th (AAAHH) = title
- $tds =
- $this->fuckhtml
- ->getElementsByTagName(
- "td"
- );
-
- $ths =
- $this->fuckhtml
- ->getElementsByTagName(
- "th"
- );
-
- if(
- count($ths) === 1 &&
- count($tds) === 1
- ){
-
- $title =
- $this->fuckhtml
- ->getTextContent(
- $ths[0]
- );
-
- $description = [];
-
- $this->fuckhtml->load($tds[0]);
-
- $lis =
- $this->fuckhtml
- ->getElementsByTagName(
- "li"
- );
-
- if(count($lis) !== 0){
-
- foreach($lis as $li){
-
- $description[] =
- $this->fuckhtml
- ->getTextContent(
- $li
- );
- }
-
- $description = implode(", ", $description);
- }else{
-
- $description =
- $this->fuckhtml
- ->getTextContent(
- $tds[0]
- );
- }
-
- $answer["table"][$title] = $description;
- }
- }
- }
-
- $out["answer"][] = $answer;
-
- }catch(Exception $error){
-
- // do nothing
- //echo "error!";
- }
- }
-
return $out;
}
@@ -1428,12 +1120,16 @@ class startpage{
[
"lui" => "english",
"language" => "english",
- "query" => $str["q"],
- "cat" => $pagetype,
"sc" => $str["sc"],
"t" => "device",
+ "cat" => $pagetype,
"segment" => "startpage.udog",
- "page" => $str["page"]
+ "abd" => 0,
+ "abe" => 0,
+ "query" => $str["q"],
+ "page" => $str["page"],
+ "qsr" => "all",
+ "qadf" => "none" // @ todo fix (??)
]
),
$pagetype,
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage