diff options
| author | lolcat <will@lolcat.ca> | 2024-07-29 18:25:25 -0400 |
|---|---|---|
| committer | lolcat <will@lolcat.ca> | 2024-07-29 18:25:25 -0400 |
| commit | 4e4796bb714a1672a6f9b6ea6c58a239a2606dd8 (patch) | |
| tree | 95ded0983a39eff8c19aef63e606f1700963425d /scraper/startpage.php | |
| parent | ff06bc1f51cc446966f7214207e2578cd8b18179 (diff) | |
startpage captcha handle
Diffstat (limited to 'scraper/startpage.php')
| -rw-r--r-- | scraper/startpage.php | 50 |
1 files changed, 50 insertions, 0 deletions
```diff
diff --git a/scraper/startpage.php b/scraper/startpage.php
index ca15779..ef9def9 100644
--- a/scraper/startpage.php
+++ b/scraper/startpage.php
@@ -408,6 +408,8 @@ class startpage{
 			//$html = file_get_contents("scraper/startpage.html");
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpWeb, ?(.+)\),$/m',
@@ -1057,6 +1059,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		$out = [
 			"status" => "ok",
 			"npt" => null,
@@ -1186,6 +1190,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpVideos, ?(.+)\),$/m',
@@ -1326,6 +1332,8 @@ class startpage{
 			}
 		}
 
+		$this->detect_captcha($html);
+
 		if(
 			preg_match(
 				'/React\.createElement\(UIStartpage\.AppSerpNews, ?(.+)\),$/m',
@@ -1526,4 +1534,46 @@ class startpage{
 			$text
 		);
 	}
+
+	private function detect_captcha($html){
+
+		$this->fuckhtml->load($html);
+
+		$title =
+			$this->fuckhtml
+			->getElementsByTagName(
+				"title"
+			);
+
+		if(
+			count($title) !== 0 &&
+			$title[0]["innerHTML"] == "Redirecting..."
+		){
+
+			// check if it's a captcha
+			$as =
+				$this->fuckhtml
+				->getElementsByTagName(
+					"a"
+				);
+
+			foreach($as as $a){
+
+				if(
+					strpos(
+						$this->fuckhtml
+						->getTextContent(
+							$a["innerHTML"]
+						),
+						"https://www.startpage.com/sp/captcha"
+					) !== false
+				){
+
+					throw new Exception("Startpage returned a captcha");
+				}
+			}
+
+			throw new Exception("Startpage redirected the scraper to an unhandled page");
+		}
+	}
 }
```
