aboutsummaryrefslogtreecommitdiff
path: root/src/Scraping/ImageScraping.c
diff options
context:
space:
mode:
authorfrosty <gabriel@bwaaa.monster>2026-03-17 13:51:12 -0400
committerfrosty <gabriel@bwaaa.monster>2026-03-17 13:51:12 -0400
commitc7b95d05715a45c7790aa8a7e4b0b61bac2e4208 (patch)
treeb0b511b4cc6610949cdde5a6a220724a31c617fd /src/Scraping/ImageScraping.c
parent8c6632502ff992e80051910451421c55894ed9d8 (diff)
downloadomnisearch-c7b95d05715a45c7790aa8a7e4b0b61bac2e4208.tar.gz
fix: refactored scraping components
Diffstat (limited to 'src/Scraping/ImageScraping.c')
-rw-r--r--src/Scraping/ImageScraping.c239
1 files changed, 239 insertions, 0 deletions
diff --git a/src/Scraping/ImageScraping.c b/src/Scraping/ImageScraping.c
new file mode 100644
index 0000000..33f710a
--- /dev/null
+++ b/src/Scraping/ImageScraping.c
@@ -0,0 +1,239 @@
+#include "ImageScraping.h"
+#include "../Utility/HttpClient.h"
+#include "Config.h"
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Builds a relative proxy path for an image: "/proxy?url=" followed by the
+ * percent-encoded form of image_url.
+ *
+ * Returns a malloc()-allocated string that the caller must free(), or NULL
+ * when image_url is NULL, no curl handle could be created, the URL could not
+ * be escaped, or the allocation failed.
+ */
+static char *build_proxy_url(const char *image_url) {
+  if (image_url == NULL)
+    return NULL;
+
+  /* A handle is created only to be passed to curl_easy_escape(). */
+  CURL *handle = curl_easy_init();
+  if (handle == NULL)
+    return NULL;
+
+  char *result = NULL;
+  char *escaped = curl_easy_escape(handle, image_url, 0);
+  if (escaped != NULL) {
+    /* Prefix + escaped URL + terminating NUL. */
+    size_t total = strlen("/proxy?url=") + strlen(escaped) + 1;
+    result = malloc(total);
+    if (result != NULL)
+      snprintf(result, total, "/proxy?url=%s", escaped);
+    curl_free(escaped);
+  }
+
+  curl_easy_cleanup(handle);
+  return result;
+}
+
+/* Returns non-zero when the node's tag name equals `tag`. */
+static int image_node_named(xmlNodePtr node, const char *tag) {
+  return xmlStrcmp(node->name, (const xmlChar *)tag) == 0;
+}
+
+/* Returns non-zero when the node's class attribute is exactly `expected`. */
+static int image_node_class_equals(xmlNodePtr node, const char *expected) {
+  xmlChar *cls = xmlGetProp(node, (const xmlChar *)"class");
+  if (!cls)
+    return 0;
+
+  int match = xmlStrcmp(cls, (const xmlChar *)expected) == 0;
+  xmlFree(cls);
+  return match;
+}
+
+/* Returns non-zero when the node's class attribute contains `needle`. */
+static int image_node_class_contains(xmlNodePtr node, const char *needle) {
+  xmlChar *cls = xmlGetProp(node, (const xmlChar *)"class");
+  if (!cls)
+    return 0;
+
+  int match = xmlStrstr(cls, (const xmlChar *)needle) != NULL;
+  xmlFree(cls);
+  return match;
+}
+
+/*
+ * Looks inside a thumbnail link for <div class="cico"> children and returns
+ * the first <img> of the last such div that has one, or NULL if none exists.
+ */
+static xmlNodePtr find_thumb_image(xmlNodePtr thumb_link) {
+  xmlNodePtr img = NULL;
+
+  for (xmlNodePtr div = thumb_link->children; div; div = div->next) {
+    if (!image_node_named(div, "div") || !image_node_class_equals(div, "cico"))
+      continue;
+
+    for (xmlNodePtr inner = div->children; inner; inner = inner->next) {
+      if (image_node_named(inner, "img")) {
+        img = inner;
+        break;
+      }
+    }
+  }
+
+  return img;
+}
+
+/*
+ * Scans the children of a <div class="meta"> node. A <div class="des"> child
+ * is stored in *des_node and an <a> child whose class contains "tit" is
+ * stored in *tit_node. Outputs are left untouched when nothing matches.
+ */
+static void scan_image_meta(xmlNodePtr meta, xmlNodePtr *des_node,
+                            xmlNodePtr *tit_node) {
+  for (xmlNodePtr entry = meta->children; entry; entry = entry->next) {
+    if (image_node_named(entry, "div")) {
+      if (image_node_class_equals(entry, "des"))
+        *des_node = entry;
+    } else if (image_node_named(entry, "a")) {
+      if (image_node_class_contains(entry, "tit"))
+        *tit_node = entry;
+    }
+  }
+}
+
+/*
+ * Extracts one image result from a result-item node.
+ *
+ * The thumbnail comes from the <img src> found under an <a> whose class
+ * contains "thumb"; that link's href becomes full_url. The title is the text
+ * of the "des" div when present, otherwise the text of the "tit" link; the
+ * "tit" link's href becomes page_url. Missing title/page/full values fall
+ * back to "Image", "#" and "#". The thumbnail is routed through
+ * build_proxy_url(), falling back to the raw src if that fails.
+ *
+ * On success returns 1 and fills `result` with four heap-allocated strings
+ * (all released with free()). Returns 0 and leaves `result` untouched when no
+ * non-empty image src was found or when any allocation failed.
+ */
+static int parse_image_node(xmlNodePtr node, ImageResult *result) {
+  if (!node || !result)
+    return 0;
+
+  xmlNodePtr img_node = NULL;
+  xmlNodePtr tit_node = NULL;
+  xmlNodePtr des_node = NULL;
+  xmlNodePtr thumb_link = NULL;
+
+  for (xmlNodePtr child = node->children; child; child = child->next) {
+    if (child->type != XML_ELEMENT_NODE)
+      continue;
+
+    if (image_node_named(child, "a")) {
+      if (image_node_class_contains(child, "thumb")) {
+        thumb_link = child;
+        /* Keep an earlier image if this link does not carry one. */
+        xmlNodePtr found = find_thumb_image(child);
+        if (found)
+          img_node = found;
+      } else if (image_node_class_contains(child, "tit")) {
+        tit_node = child;
+      }
+    } else if (image_node_named(child, "div") &&
+               image_node_class_equals(child, "meta")) {
+      scan_image_meta(child, &des_node, &tit_node);
+    }
+  }
+
+  xmlChar *iurl =
+      img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
+  xmlChar *full_url =
+      thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
+  xmlChar *title = des_node ? xmlNodeGetContent(des_node)
+                            : (tit_node ? xmlNodeGetContent(tit_node) : NULL);
+  xmlChar *rurl =
+      tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
+
+  int ok = 0;
+  if (iurl && iurl[0] != '\0') {
+    /* Take ownership of the proxy URL directly instead of copying it. */
+    char *thumbnail = build_proxy_url((const char *)iurl);
+    if (!thumbnail)
+      thumbnail = strdup((const char *)iurl);
+
+    char *title_copy = strdup(title ? (const char *)title : "Image");
+    char *page_copy = strdup(rurl ? (const char *)rurl : "#");
+    char *full_copy = strdup(full_url ? (const char *)full_url : "#");
+
+    if (thumbnail && title_copy && page_copy && full_copy) {
+      result->thumbnail_url = thumbnail;
+      result->title = title_copy;
+      result->page_url = page_copy;
+      result->full_url = full_copy;
+      ok = 1;
+    } else {
+      /* Never hand back a half-filled result: drop whatever was allocated. */
+      free(thumbnail);
+      free(title_copy);
+      free(page_copy);
+      free(full_copy);
+    }
+  }
+
+  if (iurl)
+    xmlFree(iurl);
+  if (title)
+    xmlFree(title);
+  if (rurl)
+    xmlFree(rurl);
+  if (full_url)
+    xmlFree(full_url);
+
+  return ok;
+}
+
+/*
+ * Fetches one page of image search results for `query` from BING_IMAGE_URL
+ * and parses every <div class="item"> node into an ImageResult.
+ *
+ * query        Search text; must be non-NULL and non-empty.
+ * page         1-based page number; values below 1 are treated as 1.
+ * out_results  Receives a heap-allocated array (or NULL when there is
+ *              nothing to return). Release it with free_image_results().
+ * out_count    Receives the number of valid entries in *out_results.
+ *
+ * Returns 0 on success, including the case where no results were found, and
+ * -1 on invalid arguments, URL construction failure, HTTP failure, parse
+ * failure or allocation failure. At most IMAGE_RESULTS_PER_PAGE entries are
+ * returned.
+ */
+int scrape_images(const char *query, int page, ImageResult **out_results,
+                  int *out_count) {
+  if (!out_results || !out_count)
+    return -1;
+
+  *out_results = NULL;
+  *out_count = 0;
+
+  if (!query || query[0] == '\0')
+    return -1;
+
+  /* A page below 1 would otherwise yield a zero or negative offset. */
+  if (page < 1)
+    page = 1;
+
+  CURL *escaper = curl_easy_init();
+  if (!escaper)
+    return -1;
+
+  char *encoded_query = curl_easy_escape(escaper, query, 0);
+  curl_easy_cleanup(escaper);
+  if (!encoded_query)
+    return -1;
+
+  char url[BUFFER_SIZE_LARGE];
+  /* Widen before multiplying so a large page number cannot overflow int. */
+  long long first = (long long)(page - 1) * IMAGE_RESULTS_PER_PAGE + 1;
+  int url_len = snprintf(url, sizeof(url), "%s?q=%s&first=%lld",
+                         BING_IMAGE_URL, encoded_query, first);
+  /* Strings returned by curl_easy_escape() must be released by curl_free(). */
+  curl_free(encoded_query);
+
+  /* Refuse to send a silently truncated URL. */
+  if (url_len < 0 || (size_t)url_len >= sizeof(url))
+    return -1;
+
+  HttpResponse resp = http_get(
+      url,
+      "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
+  if (!resp.memory)
+    return -1;
+
+  int rc = -1;
+  int nodes = 0;
+  int max_images = 0;
+  int count = 0;
+  ImageResult *results = NULL;
+  htmlDocPtr doc = NULL;
+  xmlXPathContextPtr xpath_ctx = NULL;
+  xmlXPathObjectPtr xpath_obj = NULL;
+
+  doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL,
+                       HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
+  if (!doc)
+    goto cleanup;
+
+  xpath_ctx = xmlXPathNewContext(doc);
+  if (!xpath_ctx)
+    goto cleanup;
+
+  xpath_obj = xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']",
+                                     xpath_ctx);
+
+  /* No matching nodes is a successful, empty search rather than an error. */
+  if (!xpath_obj || !xpath_obj->nodesetval) {
+    rc = 0;
+    goto cleanup;
+  }
+
+  nodes = xpath_obj->nodesetval->nodeNr;
+  max_images =
+      (nodes < IMAGE_RESULTS_PER_PAGE) ? nodes : IMAGE_RESULTS_PER_PAGE;
+
+  /* Avoid a zero-sized allocation, whose NULL result would look like OOM. */
+  if (max_images <= 0) {
+    rc = 0;
+    goto cleanup;
+  }
+
+  results = calloc((size_t)max_images, sizeof(*results));
+  if (!results)
+    goto cleanup;
+
+  /* Nodes that fail to parse are skipped and do not consume a slot. */
+  for (int i = 0; i < nodes && count < max_images; i++) {
+    xmlNodePtr node = xpath_obj->nodesetval->nodeTab[i];
+    if (parse_image_node(node, &results[count]))
+      count++;
+  }
+
+  *out_results = results;
+  *out_count = count;
+  rc = 0;
+
+cleanup:
+  if (xpath_obj)
+    xmlXPathFreeObject(xpath_obj);
+  if (xpath_ctx)
+    xmlXPathFreeContext(xpath_ctx);
+  if (doc)
+    xmlFreeDoc(doc);
+  http_response_free(&resp);
+  return rc;
+}
+
+/*
+ * Releases an array of image results: the four strings owned by each of the
+ * first `count` entries, then the array itself. A NULL `results` is a no-op.
+ */
+void free_image_results(ImageResult *results, int count) {
+  if (results == NULL)
+    return;
+
+  int idx = 0;
+  while (idx < count) {
+    ImageResult *entry = &results[idx];
+    free(entry->thumbnail_url);
+    free(entry->title);
+    free(entry->page_url);
+    free(entry->full_url);
+    idx++;
+  }
+
+  free(results);
+}