about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2026-04-26 16:22:24 -0400
committer lolcat <will@lolcat.ca> 2026-04-26 16:22:24 -0400
commit 9ea0372bb751fe403f17566c4a4a65cd073997ec (patch)
tree c373d2f788496fb9508c32c386871ab09f168aa7
parent a54f212550b158a820cd9ae0f65d59ad4dd3b173 (diff)
remove extremely low quality scrapers
-rw-r--r-- lib/frontend.php 5
-rw-r--r-- scraper/crowdview.php 145
-rw-r--r-- scraper/curlie.php 309
-rw-r--r-- scraper/greppr.php 452
4 files changed, 1 insertions, 910 deletions
diff --git a/lib/frontend.php b/lib/frontend.php
index 13bbfab..8f2f20b 100644
--- a/lib/frontend.php
+++ b/lib/frontend.php
@@ -646,16 +646,13 @@ class frontend{
"qwant" => "Qwant",
"ghostery" => "Ghostery",
"yep" => "Yep",
- "greppr" => "Greppr",
- "crowdview" => "Crowdview",
"mwmbl" => "Mwmbl",
"mojeek" => "Mojeek",
"baidu" => "Baidu",
"coccoc" => "Cốc Cốc",
"solofield" => "Solofield",
"marginalia" => "Marginalia",
- "wiby" => "wiby",
- "curlie" => "Curlie"
+ "wiby" => "wiby"
]
];
break;
diff --git a/scraper/crowdview.php b/scraper/crowdview.php
deleted file mode 100644
index 8fb267b..0000000
--- a/scraper/crowdview.php
+++ /dev/null
@@ -1,145 +0,0 @@
-<?php
-
-class crowdview{
-
- public function __construct(){
-
- include "lib/backend.php";
- $this->backend = new backend("crowdview");
-
- include "lib/fuckhtml.php";
- $this->fuckhtml = new fuckhtml();
- }
-
- public function getfilters($page){
-
- return [];
- }
-
- private function get($proxy, $url, $get = []){
-
- $curlproc = curl_init();
-
- if($get !== []){
- $get = http_build_query($get);
- $url .= "?" . $get;
- }
-
- curl_setopt($curlproc, CURLOPT_URL, $url);
-
- curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "DNT: 1",
- "Connection: keep-alive",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
- );
-
- curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
- curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
- curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
-
- $this->backend->assign_proxy($curlproc, $proxy);
-
- $data = curl_exec($curlproc);
-
- if(curl_errno($curlproc)){
-
- throw new Exception(curl_error($curlproc));
- }
-
- curl_close($curlproc);
- return $data;
- }
-
- public function web($get){
-
- $search = $get["s"];
- if(strlen($search) === 0){
-
- throw new Exception("Search term is empty!");
- }
-
- $proxy = $this->backend->get_ip();
-
- try{
- $json = $this->get(
- $proxy,
- "https://crowdview-next-js.onrender.com/api/search-v3",
- [
- "query" => $search
- ]
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch JSON");
- }
-
- $out = [
- "status" => "ok",
- "spelling" => [
- "type" => "no_correction",
- "using" => null,
- "correction" => null
- ],
- "npt" => null,
- "answer" => [],
- "web" => [],
- "image" => [],
- "video" => [],
- "news" => [],
- "related" => []
- ];
-
- $json = json_decode($json, true);
-
- if($json === NULL){
-
- throw new Exception("Failed to decode JSON");
- }
-
- foreach($json["results"] as $item){
-
- $description = explode("<b>", $item["snippet"], 2);
-
- $out["web"][] = [
- "title" => $this->sanitize($item["title"]),
- "description" => $this->sanitize($description[1]),
- "url" => $item["link"],
- "date" => strtotime($description[0]),
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
- }
-
- return $out;
- }
-
- private function sanitize($html){
-
- return
- trim(
- $this->fuckhtml
- ->getTextContent(
- html_entity_decode(
- $html
- )
- ),
- ". "
- );
- }
-}
diff --git a/scraper/curlie.php b/scraper/curlie.php
deleted file mode 100644
index 61a8eb2..0000000
--- a/scraper/curlie.php
+++ /dev/null
@@ -1,309 +0,0 @@
-<?php
-
-class curlie{
-
- public function __construct(){
-
- include "lib/backend.php";
- $this->backend = new backend("curlie");
-
- include "lib/fuckhtml.php";
- $this->fuckhtml = new fuckhtml();
- }
-
- public function getfilters($page){
-
- if($page != "web"){
-
- return [];
- }
-
- return [
- "lang" => [
- "display" => "Language",
- "option" => [
- "any" => "Any language",
- "en" => "English",
- "de" => "German",
- "fr" => "French",
- "ja" => "Japanese",
- "it" => "Italian",
- "es" => "Spanish",
- "ru" => "Russian",
- "nl" => "Dutch",
- "pl" => "Polish",
- "tr" => "Turkish",
- "da" => "Danish",
- "sv" => "Swedish",
- "no" => "Norwegian",
- "is" => "Icelandic",
- "fo" => "Faroese",
- "fi" => "Finnish",
- "et" => "Estonian",
- "lt" => "Lithuanian",
- "lv" => "Latvian",
- "cy" => "Welsh",
- "ga" => "Irish",
- "gd" => "Scottish Gaelic",
- "br" => "Breton",
- "fy" => "Frisian",
- "frr" => "North Frisian",
- "gem" => "Saterland Frisian",
- "lb" => "Luxembourgish",
- "rm" => "Romansh",
- "pt" => "Portuguese",
- "ca" => "Catalan",
- "gl" => "Galician",
- "eu" => "Basque",
- "ast" => "Asturian",
- "an" => "Aragonese",
- "fur" => "Friulan",
- "sc" => "Sardinian",
- "scn" => "Sicilian",
- "oc" => "Occitan",
- "be" => "Belarusian",
- "cs" => "Czech",
- "hu" => "Hungarian",
- "sk" => "Slovak",
- "uk" => "Ukrainian",
- "csb" => "Kashubian",
- "tt" => "Tatar",
- "ba" => "Bashkir",
- "os" => "Ossetian",
- "sl" => "Slovene",
- "sr" => "Serbian",
- "hr" => "Croatian",
- "bs" => "Bosnian",
- "bg" => "Bulgarian",
- "sq" => "Albanian",
- "ro" => "Romanian",
- "mk" => "Macedonian",
- "el" => "Greek",
- "iw" => "Hebrew",
- "fa" => "Persian",
- "ar" => "Arabic",
- "ku" => "Kurdish",
- "az" => "Azerbaijani",
- "hy" => "Armenian",
- "af" => "Afrikaans",
- "sw" => "Kiswahili",
- "uz" => "Uzbek",
- "kk" => "Kazakh",
- "ky" => "Kyrgyz",
- "tg" => "Tajik",
- "tk" => "Turkmen",
- "ug" => "Uyghurche",
- "hi" => "Hindi",
- "si" => "Sinhalese",
- "gu" => "Gujarati",
- "ur" => "Urdu",
- "mr" => "Marathi",
- "pa" => "Punjabi",
- "bn" => "Bengali",
- "ta" => "Tamil",
- "te" => "Telugu",
- "kn" => "Kannada",
- "zh_CN" => "Chinese Simplified",
- "zh_TW" => "Chinese Traditional",
- "ko" => "Korean",
- "cfr" => "Taiwanese",
- "th" => "Thai",
- "vi" => "Vietnamese",
- "in" => "Indonesian",
- "ms" => "Malay",
- "tl" => "Tagalog",
- "eo" => "Esperanto",
- "ia" => "Interlingua",
- "la" => "Latin"
- ]
- ]
- ];
- }
-
- private function get($proxy, $url, $get = []){
-
- $curlproc = curl_init();
-
- if($get !== []){
- $get = http_build_query($get);
- $url .= "?" . $get;
- }
-
- curl_setopt($curlproc, CURLOPT_URL, $url);
-
- curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "DNT: 1",
- "Connection: keep-alive",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
- );
-
- curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
- curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
- curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
-
- $this->backend->assign_proxy($curlproc, $proxy);
-
- $data = curl_exec($curlproc);
-
- if(curl_errno($curlproc)){
-
- throw new Exception(curl_error($curlproc));
- }
-
- curl_close($curlproc);
- return $data;
- }
-
- public function web($get){
-
- if($get["npt"]){
-
- [$query, $proxy] = $this->backend->get($get["npt"], "web");
-
- try{
- $html = $this->get(
- $proxy,
- "https://curlie.org/" . $query,
- []
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch search page");
- }
-
- }else{
- $proxy = $this->backend->get_ip();
-
- $query = [
- "q" => $get["s"],
- "start" => 0,
- "stime" => 92452189 // ?
- ];
-
- if($get["lang"] !== "any"){
-
- $query["lang"] = $get["lang"];
- }
-
- try{
- $html = $this->get(
- $proxy,
- "https://curlie.org/search",
- $query
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch search page");
- }
- }
-
- $this->fuckhtml->load($html);
-
- $nextpage =
- $this->fuckhtml
- ->getElementsByClassName(
- "next-page",
- "a"
- );
-
- if(count($nextpage) !== 0){
-
- $nextpage =
- $this->backend->store(
- $nextpage[0]["attributes"]["href"],
- "web",
- $proxy
- );
- }else{
-
- $nextpage = null;
- }
-
- $out = [
- "status" => "ok",
- "spelling" => [
- "type" => "no_correction",
- "using" => null,
- "correction" => null
- ],
- "npt" => $nextpage,
- "answer" => [],
- "web" => [],
- "image" => [],
- "video" => [],
- "news" => [],
- "related" => []
- ];
-
- $items =
- $this->fuckhtml
- ->getElementsByClassName(
- "site-item",
- "div"
- );
-
- foreach($items as $item){
-
- $this->fuckhtml->load($item);
-
- $a =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "target",
- "_blank",
- "a"
- )[0];
-
- $description =
- $this->fuckhtml
- ->getElementsByClassName("site-descr");
-
- if(count($description) !== 0){
-
- $description =
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- );
- }else{
-
- $description = null;
- }
-
- $out["web"][] = [
- "title" =>
- $this->fuckhtml
- ->getTextContent(
- $a
- ),
- "description" => $description,
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $a["attributes"]["href"]
- ),
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
- }
-
- return $out;
- }
-}
diff --git a/scraper/greppr.php b/scraper/greppr.php
deleted file mode 100644
index 3d8b517..0000000
--- a/scraper/greppr.php
+++ /dev/null
@@ -1,452 +0,0 @@
-<?php
-// greppr dev probably monitors 4get code, lol
-// hello greppr dude, add an API you moron
-
-class greppr{
-
- public function __construct(){
-
- include "lib/backend.php";
- $this->backend = new backend("greppr");
-
- include "lib/fuckhtml.php";
- $this->fuckhtml = new fuckhtml();
- }
-
- public function getfilters($page){
-
- return [];
- }
-
- private function get($proxy, $url, $get = [], $cookies = [], $post = false){
-
- $curlproc = curl_init();
-
- curl_setopt($curlproc, CURLOPT_URL, $url);
-
- curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
-
- $cookie = [];
- foreach($cookies as $k => $v){
-
- $cookie[] = "{$k}={$v}";
- }
-
- $cookie = implode("; ", $cookie);
-
- if($post === false){
-
- if($get !== []){
- $get = http_build_query($get);
- $url .= "?" . $get;
- }
-
- if($cookie == ""){
-
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "DNT: 1",
- "Connection: keep-alive",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
- );
- }else{
-
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip, deflate, br, zstd",
- "DNT: 1",
- "Sec-GPC: 1",
- "Connection: keep-alive",
- "Referer: https://greppr.org/search",
- "Cookie: {$cookie}",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: same-origin",
- "Sec-Fetch-User: ?1",
- "Priority: u=0, i"]
- );
- }
- }else{
-
- $get = http_build_query($get);
-
- curl_setopt($curlproc, CURLOPT_POST, true);
- curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
-
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip, deflate, br, zstd",
- "Content-Type: application/x-www-form-urlencoded",
- "Content-Length: " . strlen($get),
- "Origin: https://greppr.org",
- "DNT: 1",
- "Sec-GPC: 1",
- "Connection: keep-alive",
- "Referer: https://greppr.org/",
- "Cookie: {$cookie}",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: same-origin",
- "Sec-Fetch-User: ?1",
- "Priority: u=0, i"]
- );
- }
-
- curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
- curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
- curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
- curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
-
- $this->backend->assign_proxy($curlproc, $proxy);
-
- $headers = [];
-
- curl_setopt(
- $curlproc,
- CURLOPT_HEADERFUNCTION,
- function($curlproc, $header) use (&$headers){
-
- $len = strlen($header);
- $header = explode(':', $header, 2);
-
- if(count($header) < 2){
-
- // ignore invalid headers
- return $len;
- }
-
- $headers[strtolower(trim($header[0]))][] = trim($header[1]);
-
- return $len;
- }
- );
-
- $data = curl_exec($curlproc);
-
- if(curl_errno($curlproc)){
-
- throw new Exception(curl_error($curlproc));
- }
-
- curl_close($curlproc);
-
- return [
- "headers" => $headers,
- "data" => $data
- ];
- }
-
- public function web($get, $first_attempt = true){
-
- if($get["npt"]){
-
- [$q, $proxy] = $this->backend->get($get["npt"], "web");
-
- $tokens = json_decode($q, true);
-
- //
- // Get paginated page
- //
- try{
-
- $html = $this->get(
- $proxy,
- "https://greppr.org" . $tokens["get"],
- [],
- $tokens["cookies"],
- false
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch search page");
- }
-
- }else{
-
- $search = $get["s"];
- if(strlen($search) === 0){
-
- throw new Exception("Search term is empty!");
- }
-
- $proxy = $this->backend->get_ip();
-
- //
- // get token
- //
- try{
-
- $html =
- $this->get(
- $proxy,
- "https://greppr.org",
- [],
- [],
- false
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch homepage");
- }
-
- //
- // Parse token
- //
- $this->fuckhtml->load($html["data"]);
-
- $tokens = [
- "req" => null,
- "data" => null,
- "cookies" => null
- ];
-
- $inputs =
- $this->fuckhtml
- ->getElementsByTagName(
- "input"
- );
-
- foreach($inputs as $input){
-
- if(!isset($input["attributes"]["name"])){
-
- continue;
- }
-
- if(
- isset($input["attributes"]["value"]) &&
- !empty($input["attributes"]["value"])
- ){
-
- $tokens
- ["data"]
- [$this->fuckhtml
- ->getTextContent(
- $input["attributes"]["name"]
- )] =
- $this->fuckhtml
- ->getTextContent(
- $input["attributes"]["value"]
- );
- }else{
-
- $tokens["req"] =
- $this->fuckhtml
- ->getTextContent(
- $input["attributes"]["name"]
- );
- }
- }
-
- if($tokens["req"] === null){
-
- throw new Exception("Failed to get request ID");
- }
-
- if(isset($html["headers"]["set-cookie"])){
-
- foreach($html["headers"]["set-cookie"] as $cookie){
-
- if(
- preg_match(
- '/([^=]+)=([^;]+)/',
- $cookie,
- $matches
- )
- ){
-
- $tokens["cookies"][$matches[1]] = $matches[2];
- }
- }
- }
-
- //
- // Get initial search page
- //
- $tokens_req = $tokens["data"];
- $tokens_req[$tokens["req"]] = $search;
-
- try{
-
- $html = $this->get(
- $proxy,
- "https://greppr.org/search",
- $tokens_req,
- $tokens["cookies"],
- true
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch search page");
- }
- }
-
- //$html = file_get_contents("scraper/greppr.html");
- //$this->fuckhtml->load($html);
- $this->fuckhtml->load($html["data"]);
-
- $out = [
- "status" => "ok",
- "spelling" => [
- "type" => "no_correction",
- "using" => null,
- "correction" => null
- ],
- "npt" => null,
- "answer" => [],
- "web" => [],
- "image" => [],
- "video" => [],
- "news" => [],
- "related" => []
- ];
-
- // get results for later
- $results =
- $this->fuckhtml
- ->getElementsByClassName(
- "result",
- "div"
- );
-
- // check for next page
- $next_elem =
- $this->fuckhtml
- ->getElementsByClassName(
- "pagination",
- "ul"
- );
-
- if(count($next_elem) !== 0){
-
- $this->fuckhtml->load($next_elem[0]);
-
- $as =
- $this->fuckhtml
- ->getElementsByClassName(
- "page-link",
- "a"
- );
-
- $break = false;
- foreach($as as $a){
-
- if($break === true){
-
- $out["npt"] =
- $this->backend->store(
- json_encode([
- "get" =>
- $this->fuckhtml
- ->getTextContent(
- $a["attributes"]["href"]
- ),
- "cookies" => $tokens["cookies"]
- ]),
- "web",
- $proxy
- );
- break;
- }
-
- if($a["attributes"]["href"] == "#"){
-
- $break = true;
- }
- }
- }
-
- // scrape results
- foreach($results as $result){
-
- $this->fuckhtml->load($result);
-
- $a =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- )[0];
-
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- "highlightedDesc",
- "p"
- );
-
- if(count($description) === 0){
-
- $description = null;
- }else{
-
- $description =
- $this->limitstrlen(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- );
- }
-
- $date =
- $this->fuckhtml
- ->getElementsByTagName(
- "p"
- );
-
- $date =
- strtotime(
- explode(
- ":",
- $this->fuckhtml
- ->getTextContent(
- $date[count($date) - 1]["innerHTML"]
- )
- )[1]
- );
-
- $out["web"][] = [
- "title" =>
- $this->fuckhtml
- ->getTextContent(
- $a["innerHTML"]
- ),
- "description" => $description,
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $a["attributes"]["href"]
- ),
- "date" => $date,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
- }
-
- return $out;
- }
-
- private function limitstrlen($text){
-
- return explode("\n", wordwrap($text, 300, "\n"))[0];
- }
-}
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage