aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorElse <else@localhost>2026-03-18 15:13:18 +0100
committerfrosty <gabriel@bwaaa.monster>2026-03-18 11:23:51 -0400
commit44b6a9b7603e88c7d8f7964effb0b408ce5d1e68 (patch)
tree63b2611543a53fbd731a4878e2d7a9b1bdef8cf9 /src
parentefb9f737fa9f7064601cef71afee7dd74300e908 (diff)
downloadomnisearch-44b6a9b7603e88c7d8f7964effb0b408ce5d1e68.tar.gz
Surface blocked search engine responses
Diffstat (limited to 'src')
-rw-r--r--src/Routes/Search.c114
-rw-r--r--src/Scraping/Scraping.c152
-rw-r--r--src/Scraping/Scraping.h11
3 files changed, 251 insertions, 26 deletions
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
index b9851d7..61465f1 100644
--- a/src/Routes/Search.c
+++ b/src/Routes/Search.c
@@ -155,6 +155,67 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
return current_count + 1;
}
/* Duplicate s with malloc; returns NULL on allocation failure.
 * Local stand-in for strdup (POSIX, not guaranteed by C11). */
static char *dup_cstring(const char *s) {
  size_t len = strlen(s) + 1;
  char *copy = malloc(len);
  if (copy)
    memcpy(copy, s, len);
  return copy;
}

/* Append a [engine_name, warning_message] pair to the growable matrix used
 * by the template context ("array of arrays" of strings).
 *
 * collection/inner_counts are updated in place; each row holds exactly two
 * heap-owned strings, and inner_counts[row] is set to 2. NULL engine_name or
 * warning_message is stored as "". Returns the new element count, or
 * current_count unchanged on any allocation failure — existing rows are
 * preserved and still owned by the caller either way. */
static int add_warning_to_collection(const char *engine_name,
                                     const char *warning_message,
                                     char ****collection, int **inner_counts,
                                     int current_count) {
  /* realloc keeps existing entries and leaves the old buffers valid on
   * failure, so no manual copy/free dance is needed. */
  char ***grown =
      realloc(*collection, sizeof(char **) * (current_count + 1));
  if (!grown)
    return current_count;
  *collection = grown;

  int *grown_counts =
      realloc(*inner_counts, sizeof(int) * (current_count + 1));
  if (!grown_counts)
    return current_count;
  *inner_counts = grown_counts;

  char **row = malloc(sizeof(char *) * 2);
  if (!row)
    return current_count;

  row[0] = dup_cstring(engine_name ? engine_name : "");
  row[1] = dup_cstring(warning_message ? warning_message : "");
  if (!row[0] || !row[1]) {
    free(row[0]);
    free(row[1]);
    free(row);
    return current_count;
  }

  /* Only publish the row once it is fully built, so the matrix never holds
   * a partially-initialized (or freed) entry. */
  (*collection)[current_count] = row;
  (*inner_counts)[current_count] = 2;
  return current_count + 1;
}
+
+static const char *warning_message_for_job(const ScrapeJob *job) {
+ switch (job->status) {
+ case SCRAPE_STATUS_FETCH_ERROR:
+ return "request failed before OmniSearch could read search results.";
+ case SCRAPE_STATUS_PARSE_MISMATCH:
+ return "returned search results in a format OmniSearch could not parse.";
+ case SCRAPE_STATUS_BLOCKED:
+ return "returned a captcha or another blocking page instead of search "
+ "results.";
+ default:
+ return NULL;
+ }
+}
+
int results_handler(UrlParams *params) {
TemplateContext ctx = new_context();
char *raw_query = "";
@@ -224,6 +285,8 @@ int results_handler(UrlParams *params) {
jobs[i].response.memory = NULL;
jobs[i].response.size = 0;
jobs[i].response.capacity = 0;
+ jobs[i].http_status = 0;
+ jobs[i].status = SCRAPE_STATUS_PENDING;
}
scrape_engines_parallel(jobs, ENGINE_COUNT);
@@ -260,6 +323,44 @@ int results_handler(UrlParams *params) {
free(infobox_inner_counts);
}
+ int warning_count = 0;
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ if (warning_message_for_job(&jobs[i]))
+ warning_count++;
+ }
+
+ if (warning_count > 0) {
+ char ***warning_matrix = NULL;
+ int *warning_inner_counts = NULL;
+ int warning_index = 0;
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ const char *warning_message = warning_message_for_job(&jobs[i]);
+ if (!warning_message)
+ continue;
+
+ warning_index = add_warning_to_collection(
+ jobs[i].engine->name, warning_message, &warning_matrix,
+ &warning_inner_counts, warning_index);
+ }
+
+ if (warning_index > 0) {
+ context_set_array_of_arrays(&ctx, "engine_warnings", warning_matrix,
+ warning_index, warning_inner_counts);
+ }
+
+ if (warning_matrix) {
+ for (int i = 0; i < warning_index; i++) {
+ free(warning_matrix[i][0]);
+ free(warning_matrix[i][1]);
+ free(warning_matrix[i]);
+ }
+ free(warning_matrix);
+ }
+ if (warning_inner_counts)
+ free(warning_inner_counts);
+ }
+
int total_results = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
total_results += jobs[i].results_count;
@@ -281,6 +382,15 @@ int results_handler(UrlParams *params) {
send_response(html);
free(html);
}
+ for (int i = 0; i < ENGINE_COUNT; i++)
+ free(all_results[i]);
+ if (page == 1) {
+ for (int i = 0; i < HANDLER_COUNT; i++) {
+ if (infobox_data[i].success) {
+ free_infobox(&infobox_data[i].result);
+ }
+ }
+ }
free_context(&ctx);
return 0;
}
@@ -368,6 +478,10 @@ int results_handler(UrlParams *params) {
send_response(html);
free(html);
}
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ free(all_results[i]);
+ }
}
if (page == 1) {
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index 0709de4..ff8dec8 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -6,8 +6,98 @@
#include <libxml/HTMLparser.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <time.h>
/* NULL-safe substring test: true iff both arguments are non-NULL and
 * needle occurs somewhere in response. */
static int response_contains(const char *response, const char *needle) {
  if (!response || !needle)
    return 0;
  return strstr(response, needle) != NULL;
}
+
+static int is_startpage_job(const ScrapeJob *job) {
+ return job && job->engine && strcmp(job->engine->name, "Startpage") == 0;
+}
+
+static int response_is_startpage_captcha(const ScrapeJob *job,
+ const char *response) {
+ if (!is_startpage_job(job))
+ return 0;
+
+ return response_contains(response, "<title>Startpage Captcha</title>") ||
+ response_contains(response, "Startpage Captcha") ||
+ response_contains(response, "/static-pages-assets/page-data/captcha/");
+}
+
+static int response_looks_like_results_page(const ScrapeJob *job,
+ const char *response) {
+ if (!job || !job->engine || !response)
+ return 0;
+
+ if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) {
+ return response_contains(response, "result-link") ||
+ response_contains(response, "result-snippet");
+ }
+
+ if (strcmp(job->engine->name, "Startpage") == 0) {
+ return response_contains(response, "<title>Startpage Search Results</title>") ||
+ response_contains(response, "class=\"w-gl") ||
+ response_contains(response, "data-testid=\"gl-title-link\"");
+ }
+
+ if (strcmp(job->engine->name, "Yahoo") == 0) {
+ return response_contains(response, "algo-sr") ||
+ response_contains(response, "compTitle") ||
+ response_contains(response, "compText");
+ }
+
+ return 0;
+}
+
/* Classify one engine response, setting job->status and job->results_count.
 *
 * Order matters:
 *   1. empty body            -> FETCH_ERROR (nothing to inspect)
 *   2. Startpage captcha     -> BLOCKED (checked before parsing: the captcha
 *                               page is valid HTML and would otherwise fall
 *                               through to EMPTY/PARSE_MISMATCH)
 *   3. parse + extract       -> OK when the engine parser finds results
 *   4. HTTP >= 400           -> FETCH_ERROR (error page, not a result set)
 *   5. results-page markers  -> PARSE_MISMATCH (page looks real, parser
 *                               found nothing — likely stale selectors)
 *   6. otherwise             -> EMPTY (legitimately zero results)
 */
static void classify_job_response(ScrapeJob *job, const char *response,
                                  size_t response_size) {
  job->results_count = 0;

  if (!response || response_size == 0) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  if (response_is_startpage_captcha(job, response)) {
    job->status = SCRAPE_STATUS_BLOCKED;
    return;
  }

  /* NOTE(review): htmlReadMemory takes an int size; response_size is size_t,
   * so bodies > INT_MAX would be narrowed — confirm responses stay small. */
  xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL,
                                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                     HTML_PARSE_NOWARNING);

  if (!doc) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  job->results_count =
      job->engine->parser(job->engine->name, doc, job->out_results,
                          job->max_results);
  xmlFreeDoc(doc);

  if (job->results_count > 0) {
    job->status = SCRAPE_STATUS_OK;
    return;
  }

  /* Zero results: decide why. An error status code wins over heuristics. */
  if (job->http_status >= 400) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  if (response_looks_like_results_page(job, response)) {
    job->status = SCRAPE_STATUS_PARSE_MISMATCH;
    return;
  }

  job->status = SCRAPE_STATUS_EMPTY;
}
+
int check_cache_for_job(ScrapeJob *job) {
if (get_cache_ttl_search() <= 0)
return 0;
@@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) {
if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
- xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
- HTML_PARSE_NOWARNING);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
+ classify_job_response(job, cached_data, cached_size);
+
+ if (job->status == SCRAPE_STATUS_BLOCKED) {
+ free(cached_data);
+ free(key);
+ return 0;
}
+
free(cached_data);
free(key);
@@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) {
void parse_and_cache_response(ScrapeJob *job) {
if (job->response.size == 0) {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return;
}
- char *key = cache_compute_key(job->query, job->page, job->engine->name);
- if (key && get_cache_ttl_search() > 0)
- cache_set(key, job->response.memory, job->response.size);
- free(key);
-
- xmlDocPtr doc = htmlReadMemory(
- job->response.memory, job->response.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+ classify_job_response(job, job->response.memory, job->response.size);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
- } else {
- job->results_count = 0;
+ if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) {
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0)
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
}
}
@@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
}
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
+ curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status);
+
if (msg->data.result == CURLE_OK)
parse_and_cache_response(job);
- else
+ else {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ }
cleanup_job_handle(job, handle);
}
@@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
if (job->response.memory)
free(job->response.memory);
+ job->results_count = 0;
+ job->http_status = 0;
+ job->status = SCRAPE_STATUS_PENDING;
+
if (check_cache_for_job(job)) {
job->results_count = job->results_count > 0 ? job->results_count : 0;
return 0;
}
char *encoded_query = curl_easy_escape(NULL, job->query, 0);
- if (!encoded_query)
+ if (!encoded_query) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
char *full_url =
build_search_url(job->engine->base_url, job->engine->page_param,
@@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
encoded_query, job->page);
free(encoded_query);
- if (!full_url)
+ if (!full_url) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
job->handle = curl_easy_init();
if (!job->handle) {
free(full_url);
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
}
@@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) {
return 0;
for (int i = 0; i < num_jobs; i++) {
- if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
+ if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR ||
+ jobs[i].status == SCRAPE_STATUS_BLOCKED)
return 1;
}
return 0;
@@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
int retries = 0;
retry:
+ ;
CURLM *multi_handle = curl_multi_init();
if (!multi_handle)
return -1;
@@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query,
.out_results = out_results,
.max_results = max_results,
.results_count = 0,
- .page = 1};
+ .page = 1,
+ .http_status = 0,
+ .status = SCRAPE_STATUS_PENDING};
scrape_engines_parallel(&job, 1);
return job.results_count;
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 1439118..014285f 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -32,6 +32,15 @@ typedef struct {
size_t capacity;
} MemoryBuffer;
/* Outcome of a single engine scrape job, set by classify_job_response
 * (and by the fetch/setup error paths). */
typedef enum {
  SCRAPE_STATUS_PENDING,        /* initialized, response not classified yet */
  SCRAPE_STATUS_OK,             /* parser extracted at least one result */
  SCRAPE_STATUS_EMPTY,          /* parsed cleanly but zero results */
  SCRAPE_STATUS_FETCH_ERROR,    /* transfer failed, empty body, unparsable
                                   document, or HTTP status >= 400 */
  SCRAPE_STATUS_PARSE_MISMATCH, /* page has results-page markers but the
                                   parser found nothing (stale selectors?) */
  SCRAPE_STATUS_BLOCKED,        /* captcha / blocking page detected */
} ScrapeStatus;
+
typedef struct {
const SearchEngine *engine;
char *query;
@@ -41,6 +50,8 @@ typedef struct {
CURL *handle;
MemoryBuffer response;
int results_count;
+ long http_status;
+ ScrapeStatus status;
} ScrapeJob;
extern const SearchEngine ENGINE_REGISTRY[];