From 1a5a653be348b3e3823c7c5620cdc1f7cfc87072 Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 1 Sep 2024 10:52:28 -0400 Subject: added ghostery search --- scraper/ghostery.html | 2714 +++++++++++++++++++++++++++++++++++++++++++++++++ scraper/ghostery.php | 308 ++++++ 2 files changed, 3022 insertions(+) create mode 100644 scraper/ghostery.html create mode 100644 scraper/ghostery.php (limited to 'scraper') diff --git a/scraper/ghostery.html b/scraper/ghostery.html new file mode 100644 index 0000000..cc912b4 --- /dev/null +++ b/scraper/ghostery.html @@ -0,0 +1,2714 @@ + + + + + + + + + + + + 4chan - Ghostery Private Search + + + + + + + + + + + + + + + + + +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ +
+ + +
+ + + + +
    + + + + + + + + + +
  1. + +

    + 4chan +

    +
    +
    + +
    +
    +

    + 4chan is a simple image-based bulletin board where anyone can post comments and share images anonymously. +

    +
    +
    + https://www.4chan.org/ + + + + + + + + + 2 + + + + + 2 + + + + + 1 + + + + + 5 + + + + + 1 + + + + + + + 11 + +
    +
    +
    + + + + + +

    4chan.org

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 2 + + + + + 2 + + + + + 1 + + + + + 5 + + + + + 1 + + + + + + 11 +
    +
      + +
    • + + Advertising + 2 +
    • + +
    • + + Site Analytics + 2 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Hosting + 5 +
    • + +
    • + + Extensions + 1 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  2. + + + +
  3. + +

    + 4chan - Wikipedia +

    +
    +
    + +
    +
    +

    + 4chan is an anonymous English-language imageboard website. Launched by Christopher "moot" Poole in October 2003, the site hosts boards dedicated to a wide variety of topics, from video games and television to literature, cooking, weapons, music, history, anime, fitness, politics, and sports, ... +

    +
    +
    + https://en.wikipedia.org/wiki/4chan + + + + + + + + + 3 + + + + + 1 + + + + + 5 + + + + + 1 + + + + + + + 10 + +
    +
    +
    + + + + + +

    wikipedia.org

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 3 + + + + + 1 + + + + + 5 + + + + + 1 + + + + + + 10 +
    +
      + +
    • + + Advertising + 3 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Hosting + 5 +
    • + +
    • + + Extensions + 1 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  4. + + + +
  5. + +

    + r/4chan +

    +
    +
    + +
    +
    +

    + r/4chan: The stories and information posted here are artistic works of fiction and falsehood. Only a fool would take anything posted here as fact. +

    +
    +
    + https://www.reddit.com/r/4chan/ + + + + + + + + + 5 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 6 + + + + + 1 + + + + + + + 16 + +
    +
    +
    + + + + + +

    reddit.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 5 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 6 + + + + + 1 + + + + + + 16 +
    +
      + +
    • + + Advertising + 5 +
    • + +
    • + + Site Analytics + 1 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Misc + 1 +
    • + +
    • + + Social Media + 1 +
    • + +
    • + + Hosting + 6 +
    • + +
    • + + Customer Interaction + 1 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  6. + + + +
  7. + +

    + GitHub - 4chan/4chan-API: Documentation for 4chan's read-only JSON API. +

    +
    +
    + +
    +
    +

    + Documentation for 4chan's read-only JSON API. Contribute to 4chan/4chan-API development by creating an account on GitHub. +

    +
    +
    + https://github.com/4chan/4chan-API + + + + + + + + + 2 + + + + + 1 + + + + + 4 + + + + + 1 + + + + + + + 8 + +
    +
    +
    + + + + + +

    github.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 2 + + + + + 1 + + + + + 4 + + + + + 1 + + + + + + 8 +
    +
      + +
    • + + Advertising + 2 +
    • + +
    • + + Misc + 1 +
    • + +
    • + + Hosting + 4 +
    • + +
    • + + Customer Interaction + 1 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  8. + + + +
  9. + +

    + Absolutely everything you need to know to understand 4chan, the Internet’s own bogeyman - The Washington Post +

    +
    +
    + +
    +
    +

    + A comprehensive, no-nonsense guide to one of the Internet's most confusing and influential Web sites. +

    +
    +
    + https://www.washingtonpost.com/news/the-intersect/wp/2014/09/25/absolutely-everything-you-need-to-know-to-understand-4chan-the-internets-own-bogeyman/ + + + + + + + + + 20 + + + + + 6 + + + + + 1 + + + + + 3 + + + + + 2 + + + + + 5 + + + + + + + 37 + +
    +
    +
    + + + + + +

    washingtonpost.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 20 + + + + + 6 + + + + + 1 + + + + + 3 + + + + + 2 + + + + + 5 + + + + + + 37 +
    +
      + +
    • + + Advertising + 20 +
    • + +
    • + + Site Analytics + 6 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Misc + 3 +
    • + +
    • + + Social Media + 2 +
    • + +
    • + + Hosting + 5 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  10. + + + +
  11. + +

    + The Dark Side of 4chan: Exploring the Dangers of an Unmoderated Online Community - The Bullhorn News +

    +
    +
    + +
    + +
    +
  12. + + + +
  13. + +

    + 4chan (@actual4chan) • Instagram photos and videos +

    +
    +
    + +
    +
    +

    + 15K Followers, 11 Following, 1,299 Posts - See Instagram photos and videos from 4chan (@actual4chan) +

    +
    +
    + https://www.instagram.com/actual4chan/ + + + + + + + + + 4 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 5 + + + + + + + 13 + +
    +
    +
    + + + + + +

    instagram.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 4 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 1 + + + + + 5 + + + + + + 13 +
    +
      + +
    • + + Advertising + 4 +
    • + +
    • + + Site Analytics + 1 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Misc + 1 +
    • + +
    • + + Social Media + 1 +
    • + +
    • + + Hosting + 5 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  14. + + + +
  15. + +

    + 4chan | The Guardian +

    +
    +
    + +
    +
    +

    + Latest news, sport, business, comment, analysis and reviews from the Guardian, the world's leading liberal voice +

    +
    +
    + https://www.theguardian.com/technology/4chan + + + + + + + + + 15 + + + + + 3 + + + + + 1 + + + + + 1 + + + + + 2 + + + + + 4 + + + + + + + 26 + +
    +
    +
    + + + + + +

    theguardian.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 15 + + + + + 3 + + + + + 1 + + + + + 1 + + + + + 2 + + + + + 4 + + + + + + 26 +
    +
      + +
    • + + Advertising + 15 +
    • + +
    • + + Site Analytics + 3 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Misc + 1 +
    • + +
    • + + Social Media + 2 +
    • + +
    • + + Hosting + 4 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  16. + + + +
  17. + +

    + Epic Win for Anonymous: How 4chan's Army Conquered the Web: Stryker, Cole: 9781590207109: Amazon.com: Books +

    +
    +
    + +
    +
    +

    + Epic Win for Anonymous: How 4chan's Army Conquered the Web [Stryker, Cole] on Amazon.com. *FREE* shipping on qualifying offers. Epic Win for Anonymous: How 4chan's Army Conquered the Web +

    +
    +
    + https://www.amazon.com/Epic-Win-4chans-Army-Conquered/dp/1590207106 + + + + + + + + + 18 + + + + + 3 + + + + + 1 + + + + + 5 + + + + + + + 27 + +
    +
    +
    + + + + + +

    amazon.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 18 + + + + + 3 + + + + + 1 + + + + + 5 + + + + + + 27 +
    +
      + +
    • + + Advertising + 18 +
    • + +
    • + + Site Analytics + 3 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Hosting + 5 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  18. + + + +
  19. + +

    + 4chan +

    +
    +
    + +
    +
    +

    + 4chan +

    +
    +
    + https://twitter.com/4chan + + + + + + + + + 8 + + + + + 3 + + + + + 1 + + + + + 1 + + + + + 3 + + + + + 9 + + + + + + + 25 + +
    +
    +
    + + + + + +

    twitter.com

    + + + + + +
    + +
    +
    + All trackers seen + + + + + + + + + + + +
    + + + + + + +
    + +
    +
    + + + + + + 8 + + + + + 3 + + + + + 1 + + + + + 1 + + + + + 3 + + + + + 9 + + + + + + 25 +
    +
      + +
    • + + Advertising + 8 +
    • + +
    • + + Site Analytics + 3 +
    • + +
    • + + Audio Video Player + 1 +
    • + +
    • + + Misc + 1 +
    • + +
    • + + Social Media + 3 +
    • + +
    • + + Hosting + 9 +
    • + +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
  20. + +
+ + + + + +
+

Alternative Search Engines

+ + +
+ +
+ + + + + + + + + + + + + + + + diff --git a/scraper/ghostery.php b/scraper/ghostery.php new file mode 100644 index 0000000..9492f4b --- /dev/null +++ b/scraper/ghostery.php @@ -0,0 +1,308 @@ +backend = new backend("ghostery"); + + include "lib/fuckhtml.php"; + $this->fuckhtml = new fuckhtml(); + } + + public function getfilters($page){ + + if($page != "web"){ + + return []; + } + + return [ + "country" => [ + "display" => "Country", + "option" => [ + "any" => "All regions", + "AR" => "Argentina", + "AU" => "Australia", + "AT" => "Austria", + "BE" => "Belgium", + "BR" => "Brazil", + "CA" => "Canada", + "CL" => "Chile", + "DK" => "Denmark", + "FI" => "Finland", + "FR" => "France", + "DE" => "Germany", + "HK" => "Hong Kong", + "IN" => "India", + "ID" => "Indonesia", + "IT" => "Italy", + "JP" => "Japan", + "KR" => "Korea", + "MY" => "Malaysia", + "MX" => "Mexico", + "NL" => "Netherlands", + "NZ" => "New Zealand", + "NO" => "Norway", + "CN" => "People's Republic of China", + "PL" => "Poland", + "PT" => "Portugal", + "PH" => "Republic of the Philippines", + "RU" => "Russia", + "SA" => "Saudi Arabia", + "ZA" => "South Africa", + "ES" => "Spain", + "SE" => "Sweden", + "CH" => "Switzerland", + "TW" => "Taiwan", + "TR" => "Turkey", + "GB" => "United Kingdom", + "US" => "United States" + ] + ] + ]; + } + + private function get($proxy, $url, $get = [], $country){ + + $curlproc = curl_init(); + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, + ["User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "Referer: https://ghosterysearch.com", + "DNT: 1", + "Sec-GPC: 1", + "Connection: keep-alive", + "Cookie: ctry=" . ($country == "any" ? "--" : $country) . "; noads=true", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: same-origin", + "Sec-Fetch-User: ?1", + "Priority: u=0, i"] + ); + + // http2 bypass + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + return $data; + } + + public function web($get){ + + if($get["npt"]){ + + [$query, $proxy] = $this->backend->get($get["npt"], "web"); + + parse_str($query, $query); + + // country + $country = $query["c"]; + unset($query["c"]); + + $query = http_build_query($query); + + $html = + $this->get( + $proxy, + "https://ghosterysearch.com/search?" . $query, + [], + $country + ); + }else{ + + $proxy = $this->backend->get_ip(); + + $html = + $this->get( + $proxy, + "https://ghosterysearch.com/search", + [ + "q" => $get["s"] + ], + $get["country"] + ); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $results_wrapper = + $this->fuckhtml + ->getElementsByClassName( + "results", + "section" + ); + + if(count($results_wrapper) === 0){ + + throw new Exception("Failed to grep result section"); + } + + $this->fuckhtml->load($results_wrapper[0]); + + // get search results + $results = + $this->fuckhtml + ->getElementsByClassName( + "result", + "li" + ); + + if(count($results) === 0){ + + return $out; + } + + foreach($results as $result){ + + $this->fuckhtml->load($result); + + $a = + $this->fuckhtml + ->getElementsByClassName( + "url", + "a" + ); + + if(count($a) === 0){ + + continue; + } + + $a = $a[0]; + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "h2" + )[0] + ) + ), + "description" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $this->fuckhtml + ->getElementsByTagName( + "p" + )[0] + ) + ), + "url" => + $this->fuckhtml + ->getTextContent( + $a + ["attributes"] + ["href"] + ), + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; + } + + $this->fuckhtml->load($html); + + // get pagination token + $pagination_wrapper = + $this->fuckhtml + ->getElementsByClassName( + "pagination", + "div" + ); + + if(count($pagination_wrapper) !== 0){ + + // found next page! + $this->fuckhtml->load($pagination_wrapper[0]); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($a) !== 0){ + + $q = + parse_url( + $this->fuckhtml + ->getTextContent( + $a[count($a) - 1] + ["attributes"] + ["href"] + ), + PHP_URL_QUERY + ); + + $out["npt"] = + $this->backend + ->store( + $q . "&c=" . $get["country"], + "web", + $proxy + ); + } + } + + return $out; + } + + private function titledots($title){ + + return trim($title, " .\t\n\r\0\x0B…"); + } +} -- cgit v1.2.3