fix: parse Bing image results from iusc JSON; send browser-like headers

author: frosty <gabriel@bwaaa.monster> 2026-06-02 18:18:45 -0400
committer: frosty <gabriel@bwaaa.monster> 2026-06-02 18:18:45 -0400
commit: c26a08c6a29416b3c59f1b2c9f65335b4409ce4f (patch)
tree: ab0a2532afe7c78efc6689611865e03bc4e9e005
parent: 5808459c758db353d8d39df556ae42028e762321 (diff)
download: omnisearch-c26a08c6a29416b3c59f1b2c9f65335b4409ce4f.tar.gz
2 files changed, 95 insertions, 104 deletions
diff --git a/src/Scraping/ImageScraping.c b/src/Scraping/ImageScraping.c
index 33f710a..2341244 100644
--- a/src/Scraping/ImageScraping.c
+++ b/src/Scraping/ImageScraping.c
@@ -28,113 +28,82 @@ static char *build_proxy_url(const char *image_url) {
   return proxy_url;
 }
 
-static int parse_image_node(xmlNodePtr node, ImageResult *result) {
-  xmlNodePtr img_node = NULL;
-  xmlNodePtr tit_node = NULL;
-  xmlNodePtr des_node = NULL;
-  xmlNodePtr thumb_link = NULL;
+static char *extract_json_string(const char *json, const char *key) {
+  if (!json || !key)
+    return NULL;
 
-  for (xmlNodePtr child = node->children; child; child = child->next) {
-    if (child->type != XML_ELEMENT_NODE)
-      continue;
+  char search_key[64];
+  snprintf(search_key, sizeof(search_key), "\"%s\"", key);
 
-    if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
-      xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
-      if (class) {
-        if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
-          thumb_link = child;
-          for (xmlNodePtr thumb_child = child->children; thumb_child;
-               thumb_child = thumb_child->next) {
-            if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
-              xmlChar *div_class =
-                  xmlGetProp(thumb_child, (const xmlChar *)"class");
-              if (div_class &&
-                  xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
-                for (xmlNodePtr cico_child = thumb_child->children; cico_child;
-                     cico_child = cico_child->next) {
-                  if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") ==
-                      0) {
-                    img_node = cico_child;
-                    break;
-                  }
-                }
-              }
-              if (div_class)
-                xmlFree(div_class);
-            }
-          }
-        } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
-          tit_node = child;
-        }
-        xmlFree(class);
-      }
-    } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
-      xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
-      if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
-        for (xmlNodePtr meta_child = child->children; meta_child;
-             meta_child = meta_child->next) {
-          if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
-            xmlChar *div_class =
-                xmlGetProp(meta_child, (const xmlChar *)"class");
-            if (div_class) {
-              if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
-                des_node = meta_child;
-              }
-              xmlFree(div_class);
-            }
-          } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) {
-            xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class");
-            if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
-              tit_node = meta_child;
-            }
-            if (a_class)
-              xmlFree(a_class);
-          }
-        }
-      }
-      if (class)
-        xmlFree(class);
-    }
+  const char *key_pos = strstr(json, search_key);
+  if (!key_pos)
+    return NULL;
+
+  const char *colon = strchr(key_pos + strlen(search_key), ':');
+  if (!colon)
+    return NULL;
+
+  colon++;
+  while (*colon == ' ' || *colon == '\t' || *colon == '\n' || *colon == '\r')
+    colon++;
+
+  if (*colon != '"')
+    return NULL;
+  colon++;
+
+  size_t len = 0;
+  const char *start = colon;
+  while (*colon && *colon != '"') {
+    if (*colon == '\\' && *(colon + 1))
+      colon++;
+    colon++;
+    len++;
   }
 
-  xmlChar *iurl =
-      img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
-  xmlChar *full_url =
-      thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
-  xmlChar *title = des_node ? xmlNodeGetContent(des_node)
-                            : (tit_node ? xmlNodeGetContent(tit_node) : NULL);
-  xmlChar *rurl =
-      tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
+  char *result = malloc(len + 1);
+  if (!result)
+    return NULL;
 
-  if (!iurl || strlen((char *)iurl) == 0) {
-    if (iurl)
-      xmlFree(iurl);
-    if (title)
-      xmlFree(title);
-    if (rurl)
-      xmlFree(rurl);
-    if (full_url)
-      xmlFree(full_url);
-    return 0;
+  colon = start;
+  size_t i = 0;
+  while (*colon && *colon != '"') {
+    if (*colon == '\\' && *(colon + 1))
+      colon++;
+    result[i++] = *colon++;
   }
+  result[i] = '\0';
+
+  return result;
+}
 
-  char *proxy_url = build_proxy_url((char *)iurl);
-  result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
-  free(proxy_url);
-  result->title = strdup(title ? (char *)title : "Image");
-  result->page_url = strdup(rurl ? (char *)rurl : "#");
-  result->full_url = strdup(full_url ? (char *)full_url : "#");
+static int parse_iusc_node(xmlNodePtr node, ImageResult *result) {
+  xmlChar *m_attr = xmlGetProp(node, (const xmlChar *)"m");
+  if (!m_attr)
+    return 0;
+
+  char *turl = extract_json_string((const char *)m_attr, "turl");
+  char *murl = extract_json_string((const char *)m_attr, "murl");
+  char *purl = extract_json_string((const char *)m_attr, "purl");
+  char *title = extract_json_string((const char *)m_attr, "t");
+
+  int ok = (turl != NULL && strlen(turl) > 0);
+  if (ok) {
+    char *proxy_url = build_proxy_url(turl);
+    result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup(turl);
+    free(proxy_url);
+    result->title =
+        title && strlen(title) > 0 ? strdup(title) : strdup("Image");
+    result->page_url = purl && strlen(purl) > 0 ? strdup(purl) : strdup("#");
+    result->full_url = murl && strlen(murl) > 0 ? strdup(murl) : strdup("#");
+  }
 
-  if (iurl)
-    xmlFree(iurl);
-  if (title)
-    xmlFree(title);
-  if (rurl)
-    xmlFree(rurl);
-  if (full_url)
-    xmlFree(full_url);
+  free(turl);
+  free(murl);
+  free(purl);
+  free(title);
 
-  return 1;
+  xmlFree(m_attr);
+  return ok;
 }
 
 int scrape_images(const char *query, int page, ImageResult **out_results,
@@ -157,13 +126,16 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
 
   char url[BUFFER_SIZE_LARGE];
   int first = (page - 1) * IMAGE_RESULTS_PER_PAGE + 1;
-  snprintf(url, sizeof(url), "%s?q=%s&first=%d", BING_IMAGE_URL, encoded_query,
-           first);
+  snprintf(
+      url, sizeof(url),
+      "https://www.bing.com/images/async?q=%s&async=content&first=%d&count=%d",
+      encoded_query, first, 35);
   free(encoded_query);
 
   HttpResponse resp = http_get(
       url,
-      "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+      "like Gecko) Chrome/120.0.0.0 Safari/537.36");
   if (!resp.memory) {
     return -1;
   }
@@ -183,7 +155,7 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
   }
 
   xmlXPathObjectPtr xpathObj =
-      xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);
+      xmlXPathEvalExpression((const xmlChar *)"//a[@class='iusc']", xpathCtx);
 
   if (!xpathObj || !xpathObj->nodesetval) {
     if (xpathObj)
@@ -210,7 +182,7 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
   int count = 0;
   for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) {
     xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
-    if (parse_image_node(node, &results[count])) {
+    if (parse_iusc_node(node, &results[count])) {
       count++;
     }
   }
diff --git a/src/Utility/HttpClient.c b/src/Utility/HttpClient.c
index bdd2f4d..0ffb9ff 100644
--- a/src/Utility/HttpClient.c
+++ b/src/Utility/HttpClient.c
@@ -31,6 +31,17 @@ static size_t write_callback(void *contents, size_t size, size_t nmemb,
   return realsize;
 }
 
+static struct curl_slist *build_http_headers(void) {
+  struct curl_slist *headers = NULL;
+  headers = curl_slist_append(
+      headers,
+      "Accept: "
+      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+  headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
+  headers = curl_slist_append(headers, "DNT: 1");
+  return headers;
+}
+
 HttpResponse http_get(const char *url, const char *user_agent) {
   HttpResponse resp = {.memory = NULL, .size = 0, .capacity = 0};
 
@@ -51,16 +62,24 @@ HttpResponse http_get(const char *url, const char *user_agent) {
     return resp;
   }
 
+  struct curl_slist *headers = build_http_headers();
+
   curl_easy_setopt(curl, CURLOPT_URL, url);
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
   curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
   curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp);
   curl_easy_setopt(curl, CURLOPT_USERAGENT,
                    user_agent ? user_agent : "libcurl-agent/1.0");
   curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
   apply_proxy_settings(curl);
 
   CURLcode res = curl_easy_perform(curl);
+  curl_slist_free_all(headers);
   curl_easy_cleanup(curl);
 
   if (res != CURLE_OK) {
author	frosty <gabriel@bwaaa.monster>	2026-06-02 18:18:45 -0400
committer	frosty <gabriel@bwaaa.monster>	2026-06-02 18:18:45 -0400
commit	c26a08c6a29416b3c59f1b2c9f65335b4409ce4f (patch)
tree	ab0a2532afe7c78efc6689611865e03bc4e9e005
parent	5808459c758db353d8d39df556ae42028e762321 (diff)
download	omnisearch-c26a08c6a29416b3c59f1b2c9f65335b4409ce4f.tar.gz