about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- src/Config.h 7
-rw-r--r-- src/Routes/Images.c 249
-rw-r--r-- src/Scraping/ImageScraping.c 239
-rw-r--r-- src/Scraping/ImageScraping.h 18
-rw-r--r-- src/Scraping/Scraping.c 589
-rw-r--r-- src/Scraping/Scraping.h 20
-rw-r--r-- src/Scraping/ScrapingHttp.c 109
-rw-r--r-- src/Scraping/ScrapingParsers.c 269
8 files changed, 808 insertions, 692 deletions
diff --git a/src/Config.h b/src/Config.h
index 24dafe6..e0e242c 100644
--- a/src/Config.h
+++ b/src/Config.h
@@ -23,6 +23,13 @@
#define INFOBOX_FIELD_COUNT 4
#define MAX_RESULTS_PER_ENGINE 10
+#define CURL_TIMEOUT_SECS 15L
+#define CURL_DNS_TIMEOUT_SECS 300L
+
+#define BING_IMAGE_URL "https://www.bing.com/images/search"
+#define IMAGE_RESULTS_PER_PAGE 32
+#define IMAGE_RESULT_FIELDS 4
+
typedef struct {
char host[256];
int port;
diff --git a/src/Routes/Images.c b/src/Routes/Images.c
index 7536f6b..ae25cf8 100644
--- a/src/Routes/Images.c
+++ b/src/Routes/Images.c
@@ -1,15 +1,7 @@
#include "Images.h"
-#include "../Scraping/Scraping.h"
-#include "../Utility/HttpClient.h"
+#include "../Scraping/ImageScraping.h"
#include "../Utility/Unescape.h"
-#include "../Utility/XmlHelper.h"
-
-#include <curl/curl.h>
-#include <libxml/HTMLparser.h>
-#include <libxml/xpath.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "Config.h"
int images_handler(UrlParams *params) {
TemplateContext ctx = new_context();
@@ -28,12 +20,12 @@ int images_handler(UrlParams *params) {
}
}
- context_set(&ctx, "query", raw_query);
-
char page_str[16], prev_str[16], next_str[16];
snprintf(page_str, sizeof(page_str), "%d", page);
snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0);
snprintf(next_str, sizeof(next_str), "%d", page + 1);
+
+ context_set(&ctx, "query", raw_query);
context_set(&ctx, "page", page_str);
context_set(&ctx, "prev_page", prev_str);
context_set(&ctx, "next_page", next_str);
@@ -49,208 +41,41 @@ int images_handler(UrlParams *params) {
return -1;
}
- CURL *tmp = curl_easy_init();
- if (!tmp) {
- send_response("<h1>Error initializing curl</h1>");
- if (display_query)
- free(display_query);
- free_context(&ctx);
- return -1;
- }
- char *encoded_query = curl_easy_escape(tmp, raw_query, 0);
- curl_easy_cleanup(tmp);
-
- if (!encoded_query) {
- send_response("<h1>Error encoding query</h1>");
- if (display_query)
- free(display_query);
- free_context(&ctx);
- return -1;
- }
-
- char url[1024];
- int first = (page - 1) * 32 + 1;
- snprintf(url, sizeof(url), "https://www.bing.com/images/search?q=%s&first=%d",
- encoded_query, first);
+ ImageResult *results = NULL;
+ int result_count = 0;
- HttpResponse resp = http_get(
- url,
- "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
- if (!resp.memory) {
+ if (scrape_images(raw_query, page, &results, &result_count) != 0 ||
+ !results) {
send_response("<h1>Error fetching images</h1>");
- free(encoded_query);
- free(display_query);
- free_context(&ctx);
- return -1;
- }
-
- htmlDocPtr doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
- if (!doc) {
- http_response_free(&resp);
- free(encoded_query);
free(display_query);
free_context(&ctx);
return -1;
}
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ char ***image_matrix = malloc(sizeof(char **) * result_count);
+ int *inner_counts = malloc(sizeof(int) * result_count);
- if (!xpathCtx) {
- xmlFreeDoc(doc);
- http_response_free(&resp);
- free(encoded_query);
+ if (!image_matrix || !inner_counts) {
+ if (image_matrix)
+ free(image_matrix);
+ if (inner_counts)
+ free(inner_counts);
+ free_image_results(results, result_count);
free(display_query);
free_context(&ctx);
return -1;
}
- xmlXPathObjectPtr xpathObj =
- xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);
-
- int image_count = 0;
- char ***image_matrix = NULL;
- int *inner_counts = NULL;
-
- if (xpathObj && xpathObj->nodesetval) {
- int nodes = xpathObj->nodesetval->nodeNr;
-
- int max_images = (nodes < 32) ? nodes : 32;
- image_matrix = malloc(sizeof(char **) * max_images);
- inner_counts = malloc(sizeof(int) * max_images);
- if (!image_matrix || !inner_counts) {
- if (image_matrix) free(image_matrix);
- if (inner_counts) free(inner_counts);
- image_matrix = NULL;
- inner_counts = NULL;
- }
-
- for (int i = 0; i < nodes; i++) {
- if (image_count >= 32)
- break;
-
- xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
- xmlNodePtr img_node = NULL;
- xmlNodePtr tit_node = NULL;
- xmlNodePtr des_node = NULL;
- xmlNodePtr thumb_link = NULL;
-
- for (xmlNodePtr child = node->children; child; child = child->next) {
- if (child->type != XML_ELEMENT_NODE)
- continue;
-
- if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
- xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
- if (class) {
- if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
- thumb_link = child;
- for (xmlNodePtr thumb_child = child->children; thumb_child;
- thumb_child = thumb_child->next) {
- if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
- xmlChar *div_class =
- xmlGetProp(thumb_child, (const xmlChar *)"class");
- if (div_class &&
- xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
- for (xmlNodePtr cico_child = thumb_child->children;
- cico_child; cico_child = cico_child->next) {
- if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") ==
- 0) {
- img_node = cico_child;
- break;
- }
- }
- }
- if (div_class)
- xmlFree(div_class);
- }
- }
- } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
- tit_node = child;
- }
- xmlFree(class);
- }
- } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
- xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
- if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
- for (xmlNodePtr meta_child = child->children; meta_child;
- meta_child = meta_child->next) {
- if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
- xmlChar *div_class =
- xmlGetProp(meta_child, (const xmlChar *)"class");
- if (div_class) {
- if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
- des_node = meta_child;
- }
- xmlFree(div_class);
- }
- } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") ==
- 0) {
- xmlChar *a_class =
- xmlGetProp(meta_child, (const xmlChar *)"class");
- if (a_class &&
- xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
- tit_node = meta_child;
- }
- if (a_class)
- xmlFree(a_class);
- }
- }
- }
- if (class)
- xmlFree(class);
- }
- }
-
- xmlChar *iurl =
- img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
- xmlChar *full_url =
- thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
- xmlChar *title = des_node
- ? xmlNodeGetContent(des_node)
- : (tit_node ? xmlNodeGetContent(tit_node) : NULL);
- xmlChar *rurl =
- tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
-
- if (iurl && strlen((char *)iurl) > 0) {
- char *proxy_url = NULL;
- CURL *esc_curl = curl_easy_init();
- if (esc_curl) {
- char *encoded = curl_easy_escape(esc_curl, (char *)iurl, 0);
- if (encoded) {
- size_t proxy_len = strlen("/proxy?url=") + strlen(encoded) + 1;
- proxy_url = malloc(proxy_len);
- if (proxy_url) {
- snprintf(proxy_url, proxy_len, "/proxy?url=%s", encoded);
- }
- curl_free(encoded);
- }
- curl_easy_cleanup(esc_curl);
- }
-
- image_matrix[image_count] = malloc(sizeof(char *) * 4);
- image_matrix[image_count][0] =
- proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
- free(proxy_url);
- image_matrix[image_count][1] = strdup(title ? (char *)title : "Image");
- image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#");
- image_matrix[image_count][3] =
- strdup(full_url ? (char *)full_url : "#");
- inner_counts[image_count] = 4;
- image_count++;
- }
-
- if (iurl)
- xmlFree(iurl);
- if (title)
- xmlFree(title);
- if (rurl)
- xmlFree(rurl);
- if (full_url)
- xmlFree(full_url);
- }
+ for (int i = 0; i < result_count; i++) {
+ image_matrix[i] = malloc(sizeof(char *) * IMAGE_RESULT_FIELDS);
+ image_matrix[i][0] = strdup(results[i].thumbnail_url);
+ image_matrix[i][1] = strdup(results[i].title);
+ image_matrix[i][2] = strdup(results[i].page_url);
+ image_matrix[i][3] = strdup(results[i].full_url);
+ inner_counts[i] = IMAGE_RESULT_FIELDS;
}
- context_set_array_of_arrays(&ctx, "images", image_matrix, image_count,
+ context_set_array_of_arrays(&ctx, "images", image_matrix, result_count,
inner_counts);
char *rendered = render_template("images.html", &ctx);
@@ -261,27 +86,15 @@ int images_handler(UrlParams *params) {
send_response("<h1>Error rendering image results</h1>");
}
- if (image_matrix) {
- for (int i = 0; i < image_count; i++) {
- for (int j = 0; j < 4; j++) {
- free(image_matrix[i][j]);
- }
- free(image_matrix[i]);
- }
- free(image_matrix);
- }
- if (inner_counts) {
- free(inner_counts);
+ for (int i = 0; i < result_count; i++) {
+ for (int j = 0; j < IMAGE_RESULT_FIELDS; j++)
+ free(image_matrix[i][j]);
+ free(image_matrix[i]);
}
+ free(image_matrix);
+ free(inner_counts);
- if (xpathObj)
- xmlXPathFreeObject(xpathObj);
- if (xpathCtx)
- xmlXPathFreeContext(xpathCtx);
- if (doc)
- xmlFreeDoc(doc);
- http_response_free(&resp);
- curl_free(encoded_query);
+ free_image_results(results, result_count);
free(display_query);
free_context(&ctx);
diff --git a/src/Scraping/ImageScraping.c b/src/Scraping/ImageScraping.c
new file mode 100644
index 0000000..33f710a
--- /dev/null
+++ b/src/Scraping/ImageScraping.c
@@ -0,0 +1,239 @@
+#include "ImageScraping.h"
+#include "../Utility/HttpClient.h"
+#include "Config.h"
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Wrap an upstream image URL in the local proxy endpoint.
+ *
+ * Returns a malloc'd "/proxy?url=<percent-encoded image_url>" string that the
+ * caller must free(). Returns NULL when image_url is NULL, or when creating
+ * the curl handle, escaping the URL, or allocating the buffer fails.
+ */
+static char *build_proxy_url(const char *image_url) {
+  if (!image_url)
+    return NULL;
+
+  /* A throwaway easy handle is used only for percent-encoding. */
+  CURL *escaper = curl_easy_init();
+  if (!escaper)
+    return NULL;
+
+  char *wrapped = NULL;
+  char *escaped = curl_easy_escape(escaper, image_url, 0);
+
+  if (escaped) {
+    size_t total = strlen("/proxy?url=") + strlen(escaped) + 1;
+    wrapped = malloc(total);
+    if (wrapped)
+      snprintf(wrapped, total, "/proxy?url=%s", escaped);
+    curl_free(escaped);
+  }
+
+  curl_easy_cleanup(escaper);
+  return wrapped;
+}
+
+/* Non-zero when `node` is an element whose tag name is exactly `tag`. */
+static int image_node_is_element(xmlNodePtr node, const char *tag) {
+  return node->type == XML_ELEMENT_NODE &&
+         xmlStrcmp(node->name, (const xmlChar *)tag) == 0;
+}
+
+/* Non-zero when the node's class attribute equals `wanted` exactly. */
+static int image_class_equals(xmlNodePtr node, const char *wanted) {
+  xmlChar *cls = xmlGetProp(node, (const xmlChar *)"class");
+  int match = cls && xmlStrcmp(cls, (const xmlChar *)wanted) == 0;
+  if (cls)
+    xmlFree(cls);
+  return match;
+}
+
+/* Non-zero when the node's class attribute contains the substring `part`. */
+static int image_class_contains(xmlNodePtr node, const char *part) {
+  xmlChar *cls = xmlGetProp(node, (const xmlChar *)"class");
+  int match = cls && xmlStrstr(cls, (const xmlChar *)part) != NULL;
+  if (cls)
+    xmlFree(cls);
+  return match;
+}
+
+/*
+ * Look inside a thumbnail link for <div class="cico"> wrappers and return the
+ * first <img> of the last wrapper that has one, or NULL when none is found.
+ */
+static xmlNodePtr image_find_thumb_img(xmlNodePtr thumb_link) {
+  xmlNodePtr found = NULL;
+
+  for (xmlNodePtr wrap = thumb_link->children; wrap; wrap = wrap->next) {
+    if (!image_node_is_element(wrap, "div") ||
+        !image_class_equals(wrap, "cico"))
+      continue;
+
+    for (xmlNodePtr img = wrap->children; img; img = img->next) {
+      if (image_node_is_element(img, "img")) {
+        found = img;
+        break;
+      }
+    }
+  }
+
+  return found;
+}
+
+/*
+ * Scan the children of a <div class="meta"> block, recording the description
+ * <div class="des"> and the title link (<a> whose class contains "tit").
+ */
+static void image_scan_meta(xmlNodePtr meta, xmlNodePtr *tit_node,
+                            xmlNodePtr *des_node) {
+  for (xmlNodePtr entry = meta->children; entry; entry = entry->next) {
+    if (image_node_is_element(entry, "div")) {
+      if (image_class_equals(entry, "des"))
+        *des_node = entry;
+    } else if (image_node_is_element(entry, "a")) {
+      if (image_class_contains(entry, "tit"))
+        *tit_node = entry;
+    }
+  }
+}
+
+/* strdup() of `value`, or of `fallback` when `value` is NULL. */
+static char *image_dup_or(const xmlChar *value, const char *fallback) {
+  return strdup(value ? (const char *)value : fallback);
+}
+
+/*
+ * Extract one image result from a <div class="item"> node.
+ *
+ * node:   the item element to inspect (direct children are scanned).
+ * result: filled only on success; each field is then a heap string owned by
+ *         the caller (released by free_image_results).
+ *
+ * Returns 1 when `result` was filled, 0 when the item has no usable image
+ * source or when an allocation failed. On 0 `result` is left untouched, so
+ * the caller can reuse the slot.
+ */
+static int parse_image_node(xmlNodePtr node, ImageResult *result) {
+  xmlNodePtr img_node = NULL;
+  xmlNodePtr tit_node = NULL;
+  xmlNodePtr des_node = NULL;
+  xmlNodePtr thumb_link = NULL;
+
+  for (xmlNodePtr child = node->children; child; child = child->next) {
+    if (child->type != XML_ELEMENT_NODE)
+      continue;
+
+    if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
+      /* "thumb" takes precedence over "tit" when both appear in the class. */
+      if (image_class_contains(child, "thumb")) {
+        thumb_link = child;
+        xmlNodePtr img = image_find_thumb_img(child);
+        if (img)
+          img_node = img;
+      } else if (image_class_contains(child, "tit")) {
+        tit_node = child;
+      }
+    } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0 &&
+               image_class_equals(child, "meta")) {
+      image_scan_meta(child, &tit_node, &des_node);
+    }
+  }
+
+  xmlChar *iurl =
+      img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
+  xmlChar *full_url =
+      thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
+  /* The description text is preferred; the title link text is the fallback. */
+  xmlChar *title = des_node ? xmlNodeGetContent(des_node)
+                            : (tit_node ? xmlNodeGetContent(tit_node) : NULL);
+  xmlChar *rurl =
+      tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
+
+  int filled = 0;
+
+  if (iurl && iurl[0] != '\0') {
+    /* Take ownership of the proxy URL; fall back to the raw source URL. */
+    char *thumbnail = build_proxy_url((const char *)iurl);
+    if (!thumbnail)
+      thumbnail = strdup((const char *)iurl);
+
+    char *title_copy = image_dup_or(title, "Image");
+    char *page_copy = image_dup_or(rurl, "#");
+    char *full_copy = image_dup_or(full_url, "#");
+
+    if (thumbnail && title_copy && page_copy && full_copy) {
+      result->thumbnail_url = thumbnail;
+      result->title = title_copy;
+      result->page_url = page_copy;
+      result->full_url = full_copy;
+      filled = 1;
+    } else {
+      /* Never hand back a result with NULL fields: consumers strdup them. */
+      free(thumbnail);
+      free(title_copy);
+      free(page_copy);
+      free(full_copy);
+    }
+  }
+
+  if (iurl)
+    xmlFree(iurl);
+  if (title)
+    xmlFree(title);
+  if (rurl)
+    xmlFree(rurl);
+  if (full_url)
+    xmlFree(full_url);
+
+  return filled;
+}
+
+/*
+ * Fetch one page of Bing image results for `query` and parse it.
+ *
+ * query:       raw (unescaped) search text; NULL or empty is rejected.
+ * page:        1-based page number; values below 1 are treated as 1.
+ * out_results: receives a heap array of results, or NULL when the XPath
+ *              query produced no node set. Release it with
+ *              free_image_results(*out_results, *out_count).
+ * out_count:   receives the number of filled entries in *out_results.
+ *
+ * Returns 0 when the page was fetched and parsed (possibly with zero results)
+ * and -1 on invalid input, URL overflow, fetch, parse or allocation failure.
+ * On -1 the outputs are NULL / 0.
+ */
+int scrape_images(const char *query, int page, ImageResult **out_results,
+                  int *out_count) {
+  *out_results = NULL;
+  *out_count = 0;
+
+  if (!query || query[0] == '\0')
+    return -1;
+
+  /* A page below 1 would produce a negative "first" offset. */
+  if (page < 1)
+    page = 1;
+
+  CURL *tmp = curl_easy_init();
+  if (!tmp)
+    return -1;
+
+  char *encoded_query = curl_easy_escape(tmp, query, 0);
+  curl_easy_cleanup(tmp);
+
+  if (!encoded_query)
+    return -1;
+
+  char url[BUFFER_SIZE_LARGE];
+  /* Widen before multiplying so a huge page number cannot overflow int. */
+  long long first = (long long)(page - 1) * IMAGE_RESULTS_PER_PAGE + 1;
+  int written = snprintf(url, sizeof(url), "%s?q=%s&first=%lld",
+                         BING_IMAGE_URL, encoded_query, first);
+  /* Strings from curl_easy_escape() must be released with curl_free(). */
+  curl_free(encoded_query);
+
+  /* A truncated URL could end inside a %XX escape; refuse to send it. */
+  if (written < 0 || (size_t)written >= sizeof(url))
+    return -1;
+
+  HttpResponse resp = http_get(
+      url,
+      "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
+  if (!resp.memory)
+    return -1;
+
+  int rc = -1;
+  htmlDocPtr doc = NULL;
+  xmlXPathContextPtr xpath_ctx = NULL;
+  xmlXPathObjectPtr xpath_obj = NULL;
+  ImageResult *results = NULL;
+  int count = 0;
+
+  doc = htmlReadMemory(resp.memory, resp.size, NULL, NULL,
+                       HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
+  if (!doc)
+    goto cleanup;
+
+  xpath_ctx = xmlXPathNewContext(doc);
+  if (!xpath_ctx)
+    goto cleanup;
+
+  xpath_obj = xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']",
+                                     xpath_ctx);
+  if (!xpath_obj || !xpath_obj->nodesetval) {
+    /* No node set is reported as success with no results array. */
+    rc = 0;
+    goto cleanup;
+  }
+
+  int nodes = xpath_obj->nodesetval->nodeNr;
+  int max_images =
+      (nodes < IMAGE_RESULTS_PER_PAGE) ? nodes : IMAGE_RESULTS_PER_PAGE;
+
+  /* Allocate at least one slot: a zero-size request may legally return NULL,
+   * which would turn an empty result page into a spurious failure. */
+  results = calloc(max_images > 0 ? (size_t)max_images : 1, sizeof *results);
+  if (!results)
+    goto cleanup;
+
+  /* Items without a usable image are skipped and their slot is reused. */
+  for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) {
+    xmlNodePtr node = xpath_obj->nodesetval->nodeTab[i];
+    if (parse_image_node(node, &results[count]))
+      count++;
+  }
+
+  *out_results = results;
+  *out_count = count;
+  rc = 0;
+
+cleanup:
+  if (xpath_obj)
+    xmlXPathFreeObject(xpath_obj);
+  if (xpath_ctx)
+    xmlXPathFreeContext(xpath_ctx);
+  if (doc)
+    xmlFreeDoc(doc);
+  http_response_free(&resp);
+  return rc;
+}
+
+/*
+ * Release an array produced by scrape_images(): the four strings of each of
+ * the first `count` entries, then the array itself. A NULL array is a no-op.
+ */
+void free_image_results(ImageResult *results, int count) {
+  if (results) {
+    int idx = 0;
+
+    while (idx < count) {
+      ImageResult *entry = &results[idx];
+
+      free(entry->thumbnail_url);
+      free(entry->title);
+      free(entry->page_url);
+      free(entry->full_url);
+      idx++;
+    }
+
+    free(results);
+  }
+}
diff --git a/src/Scraping/ImageScraping.h b/src/Scraping/ImageScraping.h
new file mode 100644
index 0000000..d244a63
--- /dev/null
+++ b/src/Scraping/ImageScraping.h
@@ -0,0 +1,18 @@
+#ifndef IMAGESCRAPING_H
+#define IMAGESCRAPING_H
+
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+
+typedef struct { /* One scraped image hit; each field is a heap string released by free_image_results(). */
+ char *thumbnail_url; /* "/proxy?url=<escaped src>", or the raw <img> src when the proxy URL cannot be built */
+ char *title; /* description text, else title-link text, else "Image" */
+ char *page_url; /* href of the title link, or "#" when absent */
+ char *full_url; /* href of the thumbnail link, or "#" when absent */
+} ImageResult;
+
+int scrape_images(const char *query, int page, ImageResult **out_results, /* fetches and parses one Bing image page; returns 0 on success, -1 on failure */
+ int *out_count); /* *out_results may be NULL with *out_count == 0 even when 0 is returned */
+void free_image_results(ImageResult *results, int count); /* frees the strings of the first count entries and the array; NULL is a no-op */
+
+#endif
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index 4c87890..baf536c 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -1,395 +1,20 @@
#include "Scraping.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
-#include "../Utility/Unescape.h"
-#include "../Utility/XmlHelper.h"
#include "Config.h"
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
-#include <libxml/xpath.h>
#include <stdio.h>
#include <stdlib.h>
-#include <string.h>
#include <time.h>
-#include <unistd.h>
-
-static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
- void *userp) {
- size_t realsize = size * nmemb;
- MemoryBuffer *mem = (MemoryBuffer *)userp;
-
- if (mem->size + realsize + 1 > mem->capacity) {
- size_t new_cap =
- mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity * 2;
- while (new_cap < mem->size + realsize + 1)
- new_cap *= 2;
-
- char *ptr = (char *)realloc(mem->memory, new_cap);
- if (!ptr) {
- return 0;
- }
- mem->memory = ptr;
- mem->capacity = new_cap;
- }
-
- memcpy(&(mem->memory[mem->size]), contents, realsize);
- mem->size += realsize;
- mem->memory[mem->size] = 0;
-
- return realsize;
-}
-
-static const char *get_random_user_agent(void) {
- static const char *agents[] = {
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
- "like Gecko) Chrome/120.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
- "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
- "Gecko) "
- "Chrome/120.0.0.0` Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
- "Firefox/121.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
- "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
- return agents[rand() % 5];
-}
-
-static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
- SearchResult **out_results, int max_results) {
- (void)engine_name;
- int found_count = 0;
-
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
- if (!xpathCtx) {
- return 0;
- }
-
- xmlXPathObjectPtr xpathObj = xml_xpath_eval(
- xpathCtx, "//tr[not(contains(@class, "
- "'result-sponsored'))]//a[@class='result-link']");
-
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
- if (xpathObj)
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return 0;
- }
-
- int num_links = xpathObj->nodesetval->nodeNr;
- *out_results = xml_result_alloc(num_links, max_results);
- if (!*out_results) {
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return 0;
- }
-
- for (int i = 0; i < num_links && found_count < max_results; i++) {
- xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
- char *title = xml_node_content(linkNode);
- char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
- char *snippet_text = NULL;
-
- xmlNodePtr current = linkNode->parent;
- while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
- current = current->parent;
-
- if (current && current->next) {
- xmlNodePtr snippetRow = current->next;
- while (snippetRow &&
- xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
- snippetRow = snippetRow->next;
- if (snippetRow) {
- xpathCtx->node = snippetRow;
- xmlXPathObjectPtr sObj =
- xml_xpath_eval(xpathCtx, ".//td[@class='result-snippet']");
- if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
- snippet_text = xml_node_content(sObj->nodesetval->nodeTab[0]);
- }
- if (sObj)
- xmlXPathFreeObject(sObj);
- xpathCtx->node = NULL;
- }
- }
-
- (*out_results)[found_count].url = unescape_search_url(url);
- (*out_results)[found_count].title = strdup(title ? title : "No Title");
- (*out_results)[found_count].snippet =
- strdup(snippet_text ? snippet_text : "");
- found_count++;
-
- if (title)
- xmlFree(title);
- if (url)
- xmlFree(url);
- if (snippet_text)
- xmlFree(snippet_text);
- }
-
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return found_count;
-}
-
-static int parse_startpage(const char *engine_name, xmlDocPtr doc,
- SearchResult **out_results, int max_results) {
- (void)engine_name;
- int found_count = 0;
-
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
- if (!xpathCtx) {
- return 0;
- }
-
- xmlXPathObjectPtr xpathObj =
- xml_xpath_eval(xpathCtx, "//div[contains(@class, 'result')]");
-
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
- if (xpathObj)
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return 0;
- }
-
- int num_results = xpathObj->nodesetval->nodeNr;
- *out_results = xml_result_alloc(num_results, max_results);
- if (!*out_results) {
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return 0;
- }
-
- for (int i = 0; i < num_results && found_count < max_results; i++) {
- xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
- xpathCtx->node = resultNode;
-
- xmlXPathObjectPtr linkObj =
- xml_xpath_eval(xpathCtx, ".//a[contains(@class, 'result-link')]");
- char *url =
- (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
- ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
- (xmlChar *)"href")
- : NULL;
-
- xmlXPathObjectPtr titleObj =
- xml_xpath_eval(xpathCtx, ".//h2[contains(@class, 'wgl-title')]");
- char *title =
- (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
- ? xml_node_content(titleObj->nodesetval->nodeTab[0])
- : NULL;
-
- xmlXPathObjectPtr snippetObj =
- xml_xpath_eval(xpathCtx, ".//p[contains(@class, 'description')]");
- char *snippet_text =
- (snippetObj && snippetObj->nodesetval &&
- snippetObj->nodesetval->nodeNr > 0)
- ? xml_node_content(snippetObj->nodesetval->nodeTab[0])
- : NULL;
-
- if (url && title) {
- (*out_results)[found_count].url = strdup(url);
- (*out_results)[found_count].title = strdup(title);
- (*out_results)[found_count].snippet =
- strdup(snippet_text ? snippet_text : "");
- found_count++;
- }
-
- if (title)
- xmlFree(title);
- if (url)
- xmlFree(url);
- if (snippet_text)
- xmlFree(snippet_text);
- if (linkObj)
- xmlXPathFreeObject(linkObj);
- if (titleObj)
- xmlXPathFreeObject(titleObj);
- if (snippetObj)
- xmlXPathFreeObject(snippetObj);
- }
-
- xpathCtx->node = NULL;
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return found_count;
-}
-
-static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
- SearchResult **out_results, int max_results) {
- (void)engine_name;
- int found_count = 0;
-
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
- if (!xpathCtx) {
- return 0;
- }
-
- xmlXPathObjectPtr xpathObj =
- xml_xpath_eval(xpathCtx, "//div[contains(@class, 'algo-sr')]");
-
- if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
- if (xpathObj)
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return 0;
- }
- int num_results = xpathObj->nodesetval->nodeNr;
- *out_results = xml_result_alloc(num_results, max_results);
- if (!*out_results) {
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
+int check_cache_for_job(ScrapeJob *job) {
+ if (get_cache_ttl_search() <= 0)
return 0;
- }
-
- for (int i = 0; i < num_results && found_count < max_results; i++) {
- xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
- xpathCtx->node = resultNode;
-
- xmlXPathObjectPtr linkObj = xml_xpath_eval(
- xpathCtx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
- char *url =
- (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
- ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
- (xmlChar *)"href")
- : NULL;
-
- xmlXPathObjectPtr titleObj =
- xml_xpath_eval(xpathCtx, ".//h3[contains(@class, 'title')]");
- char *title =
- (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
- ? xml_node_content(titleObj->nodesetval->nodeTab[0])
- : NULL;
-
- xmlXPathObjectPtr snippetObj =
- xml_xpath_eval(xpathCtx, ".//div[contains(@class, 'compText')]//p");
- char *snippet_text =
- (snippetObj && snippetObj->nodesetval &&
- snippetObj->nodesetval->nodeNr > 0)
- ? xml_node_content(snippetObj->nodesetval->nodeTab[0])
- : NULL;
-
- if (url && title) {
- (*out_results)[found_count].url = unescape_search_url(url);
- (*out_results)[found_count].title = strdup(title);
- (*out_results)[found_count].snippet =
- strdup(snippet_text ? snippet_text : "");
- found_count++;
- }
-
- if (title)
- xmlFree(title);
- if (url)
- xmlFree(url);
- if (snippet_text)
- xmlFree(snippet_text);
- if (linkObj)
- xmlXPathFreeObject(linkObj);
- if (titleObj)
- xmlXPathFreeObject(titleObj);
- if (snippetObj)
- xmlXPathFreeObject(snippetObj);
- }
-
- xpathCtx->node = NULL;
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return found_count;
-}
-
-const SearchEngine ENGINE_REGISTRY[] = {
- {.name = "DuckDuckGo Lite",
- .base_url = "https://lite.duckduckgo.com/lite/?q=",
- .host_header = "lite.duckduckgo.com",
- .referer = "https://lite.duckduckgo.com/",
- .page_param = "s",
- .page_multiplier = 30,
- .page_base = 0,
- .parser = parse_ddg_lite},
- {.name = "Startpage",
- .base_url = "https://www.startpage.com/sp/search?q=",
- .host_header = "www.startpage.com",
- .referer = "https://www.startpage.com/",
- .page_param = "page",
- .page_multiplier = 1,
- .page_base = 1,
- .parser = parse_startpage},
- {.name = "Yahoo",
- .base_url = "https://search.yahoo.com/search?p=",
- .host_header = "search.yahoo.com",
- .referer = "https://search.yahoo.com/",
- .page_param = "b",
- .page_multiplier = 10,
- .page_base = 1,
- .parser = parse_yahoo}};
-
-const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
-
-#define CURL_TIMEOUT 15L
-#define CURL_DNS_TIMEOUT 300L
-
-static void configure_curl_handle(CURL *curl, const char *full_url,
- MemoryBuffer *chunk,
- struct curl_slist *headers) {
- curl_easy_setopt(curl, CURLOPT_URL, full_url);
- curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
- curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
- curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
- curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
-
- curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
- curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
- curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT);
- curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
- curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT);
- curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
- curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
-
- apply_proxy_settings(curl);
-}
-
-static char *build_search_url(const char *base_url, const char *page_param,
- int page_multiplier, int page_base,
- const char *encoded_query, int page) {
- int page_value = (page < 1 ? 1 : page - 1) * page_multiplier + page_base;
- char *url = malloc(BUFFER_SIZE_LARGE);
- if (!url) {
- return NULL;
- }
- snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url, encoded_query,
- page_param, page_value);
- return url;
-}
-
-static struct curl_slist *build_request_headers(const char *host_header,
- const char *referer) {
- struct curl_slist *headers = NULL;
- char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
-
- snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
- snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
-
- headers = curl_slist_append(headers, host_buf);
- headers = curl_slist_append(headers, ref_buf);
- headers = curl_slist_append(
- headers,
- "Accept: "
- "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
- headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
- headers = curl_slist_append(headers, "DNT: 1");
-
- return headers;
-}
-
-static int check_cache_for_job(ScrapeJob *job) {
- if (get_cache_ttl_search() <= 0) {
- return 0;
- }
char *key = cache_compute_key(job->query, job->page, job->engine->name);
- if (!key) {
+ if (!key)
return 0;
- }
char *cached_data = NULL;
size_t cached_size = 0;
@@ -414,27 +39,31 @@ static int check_cache_for_job(ScrapeJob *job) {
return 0;
}
-static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
- if (msg->data.result == CURLE_OK && job->response.size > 0) {
- char *key = cache_compute_key(job->query, job->page, job->engine->name);
- if (key && get_cache_ttl_search() > 0) {
- cache_set(key, job->response.memory, job->response.size);
- free(key);
- }
+void parse_and_cache_response(ScrapeJob *job) {
+ if (job->response.size == 0) {
+ job->results_count = 0;
+ return;
+ }
- xmlDocPtr doc = htmlReadMemory(
- job->response.memory, job->response.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0)
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
- }
+ xmlDocPtr doc = htmlReadMemory(
+ job->response.memory, job->response.size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+
+ if (doc) {
+ job->results_count = job->engine->parser(
+ job->engine->name, doc, job->out_results, job->max_results);
+ xmlFreeDoc(doc);
} else {
job->results_count = 0;
}
+}
+void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
struct curl_slist *headers = NULL;
curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
if (headers)
@@ -444,67 +73,112 @@ static void process_job_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
job->response.memory = NULL;
}
-int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
- int retries = 0;
+void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
+ if (msg->data.result == CURLE_OK)
+ parse_and_cache_response(job);
+ else
+ job->results_count = 0;
-retry:
- CURLM *multi_handle = curl_multi_init();
- if (!multi_handle) {
+ cleanup_job_handle(job, handle);
+}
+
+int setup_job(ScrapeJob *job, CURLM *multi_handle) {
+ if (job->handle)
+ curl_easy_cleanup(job->handle);
+ if (job->response.memory)
+ free(job->response.memory);
+
+ if (check_cache_for_job(job)) {
+ job->results_count = job->results_count > 0 ? job->results_count : 0;
+ return 0;
+ }
+
+ char *encoded_query = curl_easy_escape(NULL, job->query, 0);
+ if (!encoded_query)
+ return -1;
+
+ char *full_url =
+ build_search_url(job->engine->base_url, job->engine->page_param,
+ job->engine->page_multiplier, job->engine->page_base,
+ encoded_query, job->page);
+ free(encoded_query);
+
+ if (!full_url)
+ return -1;
+
+ job->handle = curl_easy_init();
+ if (!job->handle) {
+ free(full_url);
return -1;
}
- for (int i = 0; i < num_jobs; i++) {
- ScrapeJob *job = &jobs[i];
+ job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
+ job->response.size = 0;
+ job->response.capacity = INITIAL_BUFFER_SIZE;
- if (job->handle) {
- curl_easy_cleanup(job->handle);
- job->handle = NULL;
- }
- if (job->response.memory) {
- free(job->response.memory);
- }
+ struct curl_slist *headers =
+ build_request_headers(job->engine->host_header, job->engine->referer);
- if (check_cache_for_job(job)) {
- job->results_count = job->results_count > 0 ? job->results_count : 0;
- continue;
- }
+ configure_curl_handle(job->handle, full_url, &job->response, headers);
+ curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
- char *encoded_query = curl_easy_escape(NULL, job->query, 0);
- if (!encoded_query) {
- continue;
- }
+ free(full_url);
+ curl_multi_add_handle(multi_handle, job->handle);
+ return 0;
+}
- char *full_url =
- build_search_url(job->engine->base_url, job->engine->page_param,
- job->engine->page_multiplier, job->engine->page_base,
- encoded_query, job->page);
- free(encoded_query);
+int handle_responses(CURLM *multi_handle, ScrapeJob *jobs, int num_jobs) {
+ CURLMsg *msg;
+ int msgs_left;
- if (!full_url) {
+ while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
+ if (msg->msg != CURLMSG_DONE)
continue;
- }
- job->handle = curl_easy_init();
- if (!job->handle) {
- free(full_url);
- continue;
+ CURL *handle = msg->easy_handle;
+
+ for (int i = 0; i < num_jobs; i++) {
+ if (jobs[i].handle && jobs[i].handle == handle) {
+ process_response(&jobs[i], handle, msg);
+ curl_multi_remove_handle(multi_handle, handle);
+ curl_easy_cleanup(handle);
+ jobs[i].handle = NULL;
+ break;
+ }
}
+ }
- job->response.memory = (char *)malloc(INITIAL_BUFFER_SIZE);
- job->response.size = 0;
- job->response.capacity = INITIAL_BUFFER_SIZE;
+ return 0;
+}
- struct curl_slist *headers =
- build_request_headers(job->engine->host_header, job->engine->referer);
+int should_retry(ScrapeJob *jobs, int num_jobs) {
+ if (proxy_count <= 0)
+ return 0;
- configure_curl_handle(job->handle, full_url, &job->response, headers);
- curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);
+ for (int i = 0; i < num_jobs; i++) {
+ if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
+ return 1;
+ }
+ return 0;
+}
- free(full_url);
- curl_multi_add_handle(multi_handle, job->handle);
+int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
+ int retries = 0;
+
+retry:
+ CURLM *multi_handle = curl_multi_init();
+ if (!multi_handle)
+ return -1;
+
+ for (int i = 0; i < num_jobs; i++) {
+ if (setup_job(&jobs[i], multi_handle) != 0 && jobs[i].handle) {
+ curl_multi_remove_handle(multi_handle, jobs[i].handle);
+ curl_easy_cleanup(jobs[i].handle);
+ jobs[i].handle = NULL;
+ }
}
- usleep(100000 + (rand() % 100000));
+ http_delay();
int still_running = 0;
curl_multi_perform(multi_handle, &still_running);
@@ -512,50 +186,17 @@ retry:
do {
int numfds = 0;
CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
-
- if (mc != CURLM_OK) {
+ if (mc != CURLM_OK)
break;
- }
-
curl_multi_perform(multi_handle, &still_running);
} while (still_running);
- CURLMsg *msg;
- int msgs_left;
- while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
- if (msg->msg == CURLMSG_DONE) {
- CURL *handle = msg->easy_handle;
-
- for (int i = 0; i < num_jobs; i++) {
- if (jobs[i].handle && jobs[i].handle == handle) {
- ScrapeJob *job = &jobs[i];
-
- process_job_response(job, handle, msg);
-
- curl_multi_remove_handle(multi_handle, handle);
- if (handle)
- curl_easy_cleanup(handle);
- job->handle = NULL;
- break;
- }
- }
- }
- }
-
+ handle_responses(multi_handle, jobs, num_jobs);
curl_multi_cleanup(multi_handle);
- if (retries < max_proxy_retries && proxy_count > 0) {
- int any_failed = 0;
- for (int i = 0; i < num_jobs; i++) {
- if (jobs[i].results_count == 0 && jobs[i].response.size == 0) {
- any_failed = 1;
- break;
- }
- }
- if (any_failed) {
- retries++;
- goto retry;
- }
+ if (retries < max_proxy_retries && should_retry(jobs, num_jobs)) {
+ retries++;
+ goto retry;
}
return 0;
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index f1ad2c4..1439118 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -3,6 +3,7 @@
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
typedef struct {
char *url;
@@ -45,6 +46,25 @@ typedef struct {
extern const SearchEngine ENGINE_REGISTRY[];
extern const int ENGINE_COUNT;
+size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
+ void *userp);
+const char *get_random_user_agent(void);
+void configure_curl_handle(CURL *curl, const char *full_url,
+ MemoryBuffer *chunk, struct curl_slist *headers);
+char *build_search_url(const char *base_url, const char *page_param,
+ int page_multiplier, int page_base,
+ const char *encoded_query, int page);
+struct curl_slist *build_request_headers(const char *host_header,
+ const char *referer);
+void http_delay(void);
+
+xmlXPathContextPtr create_xpath_context(xmlDocPtr doc);
+void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj);
+SearchResult *alloc_results_array(int capacity, int max_results);
+void assign_result(SearchResult *result, char *url, char *title, char *snippet,
+ int unescape);
+void free_xml_node_list(char *title, char *url, char *snippet);
+
int scrape_engine(const SearchEngine *engine, const char *query,
SearchResult **out_results, int max_results);
diff --git a/src/Scraping/ScrapingHttp.c b/src/Scraping/ScrapingHttp.c
new file mode 100644
index 0000000..1a6a292
--- /dev/null
+++ b/src/Scraping/ScrapingHttp.c
@@ -0,0 +1,109 @@
+#include "../Proxy/Proxy.h"
+#include "Config.h"
+#include "Scraping.h"
+#include <curl/curl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define HTTP_DELAY_MIN_US 100000
+#define HTTP_DELAY_RANGE_US 100000
+
+/*
+ * Pool of browser User-Agent strings. get_random_user_agent() returns one
+ * of these entries per call.
+ */
+static const char *USER_AGENTS[] = {
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+ "like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
+ "Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
+ "Firefox/121.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
+ "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
+
+/* Number of entries in USER_AGENTS, computed at compile time. */
+#define USER_AGENT_COUNT (sizeof(USER_AGENTS) / sizeof(USER_AGENTS[0]))
+
+/*
+ * libcurl write callback (installed through CURLOPT_WRITEFUNCTION): appends
+ * the size * nmemb bytes at `contents` to the MemoryBuffer passed as `userp`.
+ * The buffer grows geometrically and is kept NUL-terminated after every call.
+ *
+ * Returns the number of bytes consumed (size * nmemb). Returns 0 without
+ * touching the buffer when the required size would overflow size_t or the
+ * reallocation fails; libcurl treats a short count as a write error.
+ *
+ * NOTE(review): assumes MemoryBuffer.size and .capacity are size_t - confirm
+ * against the struct declaration.
+ */
+size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
+                             void *userp) {
+  const size_t size_max = (size_t)-1;
+  MemoryBuffer *mem = (MemoryBuffer *)userp;
+
+  /* Reject a size * nmemb product that would wrap around. */
+  if (size != 0 && nmemb > size_max / size)
+    return 0;
+  size_t realsize = size * nmemb;
+
+  /* Existing data + new chunk + terminating NUL must fit in a size_t. */
+  if (mem->size > size_max - 1 || realsize > size_max - 1 - mem->size)
+    return 0;
+  size_t needed = mem->size + realsize + 1;
+
+  if (needed > mem->capacity) {
+    size_t new_cap =
+        mem->capacity == 0 ? INITIAL_BUFFER_SIZE : mem->capacity;
+
+    /*
+     * Double until the chunk fits. Once doubling would overflow (or cannot
+     * make progress from zero), fall back to the exact size so the loop
+     * always terminates.
+     */
+    while (new_cap < needed) {
+      if (new_cap == 0 || new_cap > size_max / 2) {
+        new_cap = needed;
+        break;
+      }
+      new_cap *= 2;
+    }
+
+    /* Keep the old pointer until realloc succeeds so nothing is lost. */
+    char *ptr = (char *)realloc(mem->memory, new_cap);
+    if (!ptr)
+      return 0;
+    mem->memory = ptr;
+    mem->capacity = new_cap;
+  }
+
+  memcpy(&(mem->memory[mem->size]), contents, realsize);
+  mem->size += realsize;
+  mem->memory[mem->size] = 0;
+
+  return realsize;
+}
+
+/*
+ * Picks one entry of USER_AGENTS using rand() and returns it. The returned
+ * pointer refers to a string literal in the table, not to a fresh copy.
+ */
+const char *get_random_user_agent(void) {
+  size_t pick = rand() % USER_AGENT_COUNT;
+  return USER_AGENTS[pick];
+}
+
+/*
+ * Applies the common request settings to an easy handle: target URL, custom
+ * header list, response sink (write_memory_callback writing into `chunk`),
+ * a randomly chosen User-Agent, protocol/timeouts/TLS options, and finally
+ * whatever apply_proxy_settings() configures.
+ *
+ * Return values of curl_easy_setopt() are not checked here.
+ */
+void configure_curl_handle(CURL *curl, const char *full_url,
+ MemoryBuffer *chunk, struct curl_slist *headers) {
+ curl_easy_setopt(curl, CURLOPT_URL, full_url);
+ curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+ curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+ curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+ /* Empty string: per the libcurl docs, accept every encoding it supports. */
+ curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+ curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
+ curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+ /* Empty string: per the libcurl docs, enables cookies without a file. */
+ curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+ apply_proxy_settings(curl);
+}
+
+/*
+ * Builds "<base_url><encoded_query>&<page_param>=<page_value>" in a newly
+ * allocated buffer of BUFFER_SIZE_LARGE bytes.
+ *
+ * page is 1-based; values below 1 are treated as page 1. The emitted value
+ * is (page - 1) * page_multiplier + page_base.
+ *
+ * Returns a heap string the caller must free(), or NULL when the allocation
+ * fails or the formatted URL does not fit into the buffer (a truncated URL
+ * would silently query something else).
+ */
+char *build_search_url(const char *base_url, const char *page_param,
+                       int page_multiplier, int page_base,
+                       const char *encoded_query, int page) {
+  /* Clamp first, then convert to a zero-based index. */
+  int page_index = (page < 1 ? 1 : page) - 1;
+  int page_value = page_index * page_multiplier + page_base;
+
+  char *url = malloc(BUFFER_SIZE_LARGE);
+  if (!url)
+    return NULL;
+
+  int written = snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url,
+                         encoded_query, page_param, page_value);
+  if (written < 0 || (size_t)written >= (size_t)BUFFER_SIZE_LARGE) {
+    free(url);
+    return NULL;
+  }
+  return url;
+}
+
+/*
+ * Builds the header list sent with every scraping request: Host, Referer,
+ * Accept, Accept-Language and DNT, in that order.
+ *
+ * Returns a curl_slist the caller releases with curl_slist_free_all(), or
+ * NULL if any append fails. On failure the partially built list is freed
+ * here, so nothing leaks and no request goes out with only some headers.
+ */
+struct curl_slist *build_request_headers(const char *host_header,
+                                         const char *referer) {
+  char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
+
+  snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
+  snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
+
+  const char *const lines[] = {
+      host_buf,
+      ref_buf,
+      "Accept: "
+      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+      "Accept-Language: en-US,en;q=0.5",
+      "DNT: 1",
+  };
+
+  struct curl_slist *headers = NULL;
+  for (size_t i = 0; i < sizeof(lines) / sizeof(lines[0]); i++) {
+    /*
+     * curl_slist_append() returns NULL on failure and leaves the existing
+     * list alone, so assign through a temporary to keep the old head.
+     */
+    struct curl_slist *grown = curl_slist_append(headers, lines[i]);
+    if (!grown) {
+      curl_slist_free_all(headers);
+      return NULL;
+    }
+    headers = grown;
+  }
+
+  return headers;
+}
+
+/*
+ * Sleeps for HTTP_DELAY_MIN_US microseconds plus a rand()-derived jitter
+ * in the range [0, HTTP_DELAY_RANGE_US).
+ */
+void http_delay(void) {
+  int jitter_us = rand() % HTTP_DELAY_RANGE_US;
+  usleep(HTTP_DELAY_MIN_US + jitter_us);
+}
diff --git a/src/Scraping/ScrapingParsers.c b/src/Scraping/ScrapingParsers.c
new file mode 100644
index 0000000..818d333
--- /dev/null
+++ b/src/Scraping/ScrapingParsers.c
@@ -0,0 +1,269 @@
+#include "../Utility/Unescape.h"
+#include "../Utility/XmlHelper.h"
+#include "Config.h"
+#include "Scraping.h"
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Creates an XPath evaluation context for `doc` by forwarding to
+ * xmlXPathNewContext(). Returns whatever that call returns; callers in this
+ * file treat NULL as failure and release the context with
+ * free_xpath_objects().
+ */
+xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) {
+ return xmlXPathNewContext(doc);
+}
+
+/*
+ * Releases an XPath result object and its evaluation context. Either
+ * argument may be NULL, in which case it is skipped. The object is freed
+ * before the context.
+ */
+void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) {
+  if (obj != NULL) {
+    xmlXPathFreeObject(obj);
+  }
+  if (ctx != NULL) {
+    xmlXPathFreeContext(ctx);
+  }
+}
+
+/*
+ * Allocates a SearchResult array through xml_result_alloc(), passing
+ * `capacity` and the smaller of `capacity` and `max_results`.
+ * NOTE(review): the meaning of xml_result_alloc()'s second argument is not
+ * visible from this file - confirm against XmlHelper.
+ */
+SearchResult *alloc_results_array(int capacity, int max_results) {
+  int smaller = max_results;
+  if (capacity < max_results) {
+    smaller = capacity;
+  }
+  return xml_result_alloc(capacity, smaller);
+}
+
+/*
+ * Fills one SearchResult with owned copies of the given strings.
+ *
+ * url:     passed to unescape_search_url() when `unescape` is non-zero,
+ *          otherwise duplicated (NULL becomes "").
+ * title:   duplicated; NULL becomes "No Title".
+ * snippet: duplicated; NULL becomes "".
+ *
+ * The inputs are not freed or modified here. strdup() results are stored
+ * unchecked.
+ */
+void assign_result(SearchResult *result, char *url, char *title, char *snippet,
+                   int unescape) {
+  const char *title_src = title ? title : "No Title";
+  const char *snippet_src = snippet ? snippet : "";
+
+  if (unescape) {
+    result->url = unescape_search_url(url);
+  } else {
+    result->url = strdup(url ? url : "");
+  }
+  result->title = strdup(title_src);
+  result->snippet = strdup(snippet_src);
+}
+
+/*
+ * Releases up to three libxml-allocated strings with xmlFree(), in the
+ * order title, url, snippet. NULL arguments are skipped.
+ */
+void free_xml_node_list(char *title, char *url, char *snippet) {
+  char *owned[] = {title, url, snippet};
+
+  for (size_t i = 0; i < sizeof(owned) / sizeof(owned[0]); i++) {
+    if (owned[i]) {
+      xmlFree(owned[i]);
+    }
+  }
+}
+
+/*
+ * Parser for the "DuckDuckGo Lite" registry entry.
+ *
+ * Selects every <a class="result-link"> that sits in a table row whose class
+ * does not contain 'result-sponsored'. For each link it takes the node text
+ * as title, the href attribute as URL, and looks for a
+ * <td class="result-snippet"> in the next <tr> sibling of the link's row.
+ *
+ * On success *out_results points to an array from alloc_results_array() and
+ * the number of filled entries (at most max_results) is returned. Returns 0
+ * when the context cannot be created, nothing matches, or the allocation
+ * fails. engine_name is unused.
+ */
+static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ (void)engine_name;
+ int found_count = 0;
+
+ xmlXPathContextPtr ctx = create_xpath_context(doc);
+ if (!ctx)
+ return 0;
+
+ xmlXPathObjectPtr obj =
+ xml_xpath_eval(ctx, "//tr[not(contains(@class, "
+ "'result-sponsored'))]//a[@class='result-link']");
+
+ if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ int num_links = obj->nodesetval->nodeNr;
+ *out_results = alloc_results_array(num_links, max_results);
+ if (!*out_results) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ for (int i = 0; i < num_links && found_count < max_results; i++) {
+ xmlNodePtr link_node = obj->nodesetval->nodeTab[i];
+ char *title = xml_node_content(link_node);
+ char *url = (char *)xmlGetProp(link_node, (xmlChar *)"href");
+ char *snippet_text = NULL;
+
+ /* Walk up to the <tr> that contains the link. */
+ xmlNodePtr current = link_node->parent;
+ while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
+ current = current->parent;
+
+ if (current && current->next) {
+ /* Advance to the next sibling that is itself a <tr>. */
+ xmlNodePtr snippet_row = current->next;
+ while (snippet_row &&
+ xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0)
+ snippet_row = snippet_row->next;
+ if (snippet_row) {
+ /* Scope the relative XPath to that row, then reset the context. */
+ ctx->node = snippet_row;
+ xmlXPathObjectPtr s_obj =
+ xml_xpath_eval(ctx, ".//td[@class='result-snippet']");
+ if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0)
+ snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]);
+ if (s_obj)
+ xmlXPathFreeObject(s_obj);
+ ctx->node = NULL;
+ }
+ }
+
+ /*
+ * NOTE(review): unlike the other parsers, the entry is stored even when
+ * url or title is NULL; a NULL url reaches unescape_search_url() - confirm
+ * that helper tolerates NULL.
+ */
+ assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
+ free_xml_node_list(title, url, snippet_text);
+ found_count++;
+ }
+
+ free_xpath_objects(ctx, obj);
+ return found_count;
+}
+
+/*
+ * Parser for the "Startpage" registry entry.
+ *
+ * Selects every <div> whose class contains 'result'. Within each one it
+ * reads the href of the first <a> whose class contains 'result-link', the
+ * text of the first <h2> whose class contains 'wgl-title', and the text of
+ * the first <p> whose class contains 'description'. An entry is stored only
+ * when both URL and title were found; the URL is stored without unescaping.
+ *
+ * On success *out_results points to an array from alloc_results_array() and
+ * the number of filled entries (at most max_results) is returned. Returns 0
+ * when the context cannot be created, nothing matches, or the allocation
+ * fails. engine_name is unused.
+ */
+static int parse_startpage(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ (void)engine_name;
+ int found_count = 0;
+
+ xmlXPathContextPtr ctx = create_xpath_context(doc);
+ if (!ctx)
+ return 0;
+
+ xmlXPathObjectPtr obj =
+ xml_xpath_eval(ctx, "//div[contains(@class, 'result')]");
+
+ if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ int num_results = obj->nodesetval->nodeNr;
+ *out_results = alloc_results_array(num_results, max_results);
+ if (!*out_results) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
+ /* The relative ".//" queries below are evaluated against this node. */
+ ctx->node = result_node;
+
+ xmlXPathObjectPtr link_obj =
+ xml_xpath_eval(ctx, ".//a[contains(@class, 'result-link')]");
+ char *url =
+ (link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr title_obj =
+ xml_xpath_eval(ctx, ".//h2[contains(@class, 'wgl-title')]");
+ char *title = (title_obj && title_obj->nodesetval &&
+ title_obj->nodesetval->nodeNr > 0)
+ ? xml_node_content(title_obj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippet_obj =
+ xml_xpath_eval(ctx, ".//p[contains(@class, 'description')]");
+ char *snippet_text =
+ (snippet_obj && snippet_obj->nodesetval &&
+ snippet_obj->nodesetval->nodeNr > 0)
+ ? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
+ : NULL;
+
+ /* Candidates lacking a URL or title are dropped, not stored. */
+ if (url && title) {
+ assign_result(&(*out_results)[found_count], url, title, snippet_text, 0);
+ found_count++;
+ }
+
+ free_xml_node_list(title, url, snippet_text);
+ if (link_obj)
+ xmlXPathFreeObject(link_obj);
+ if (title_obj)
+ xmlXPathFreeObject(title_obj);
+ if (snippet_obj)
+ xmlXPathFreeObject(snippet_obj);
+ }
+
+ ctx->node = NULL;
+ free_xpath_objects(ctx, obj);
+ return found_count;
+}
+
+/*
+ * Parser for the "Yahoo" registry entry.
+ *
+ * Selects every <div> whose class contains 'algo-sr'. Within each one it
+ * reads the href of the first <a target="_blank"> under a <div> whose class
+ * contains 'compTitle', the text of the first <h3> whose class contains
+ * 'title', and the text of the first <p> under a <div> whose class contains
+ * 'compText'. An entry is stored only when both URL and title were found;
+ * the URL goes through unescape_search_url() (unescape flag = 1).
+ *
+ * On success *out_results points to an array from alloc_results_array() and
+ * the number of filled entries (at most max_results) is returned. Returns 0
+ * when the context cannot be created, nothing matches, or the allocation
+ * fails. engine_name is unused.
+ */
+static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ (void)engine_name;
+ int found_count = 0;
+
+ xmlXPathContextPtr ctx = create_xpath_context(doc);
+ if (!ctx)
+ return 0;
+
+ xmlXPathObjectPtr obj =
+ xml_xpath_eval(ctx, "//div[contains(@class, 'algo-sr')]");
+
+ if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ int num_results = obj->nodesetval->nodeNr;
+ *out_results = alloc_results_array(num_results, max_results);
+ if (!*out_results) {
+ free_xpath_objects(ctx, obj);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr result_node = obj->nodesetval->nodeTab[i];
+ /* The relative ".//" queries below are evaluated against this node. */
+ ctx->node = result_node;
+
+ xmlXPathObjectPtr link_obj = xml_xpath_eval(
+ ctx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']");
+ char *url =
+ (link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr title_obj =
+ xml_xpath_eval(ctx, ".//h3[contains(@class, 'title')]");
+ char *title = (title_obj && title_obj->nodesetval &&
+ title_obj->nodesetval->nodeNr > 0)
+ ? xml_node_content(title_obj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippet_obj =
+ xml_xpath_eval(ctx, ".//div[contains(@class, 'compText')]//p");
+ char *snippet_text =
+ (snippet_obj && snippet_obj->nodesetval &&
+ snippet_obj->nodesetval->nodeNr > 0)
+ ? xml_node_content(snippet_obj->nodesetval->nodeTab[0])
+ : NULL;
+
+ /* Candidates lacking a URL or title are dropped, not stored. */
+ if (url && title) {
+ assign_result(&(*out_results)[found_count], url, title, snippet_text, 1);
+ found_count++;
+ }
+
+ free_xml_node_list(title, url, snippet_text);
+ if (link_obj)
+ xmlXPathFreeObject(link_obj);
+ if (title_obj)
+ xmlXPathFreeObject(title_obj);
+ if (snippet_obj)
+ xmlXPathFreeObject(snippet_obj);
+ }
+
+ ctx->node = NULL;
+ free_xpath_objects(ctx, obj);
+ return found_count;
+}
+
+/*
+ * Table of supported search engines. Each entry pairs the request details
+ * (query URL prefix, Host and Referer header values, paging parameter) with
+ * the parser for that engine's HTML.
+ *
+ * For page >= 1, build_search_url() emits the paging parameter as
+ * (page - 1) * page_multiplier + page_base.
+ */
+const SearchEngine ENGINE_REGISTRY[] = {
+ {.name = "DuckDuckGo Lite",
+ .base_url = "https://lite.duckduckgo.com/lite/?q=",
+ .host_header = "lite.duckduckgo.com",
+ .referer = "https://lite.duckduckgo.com/",
+ .page_param = "s",
+ .page_multiplier = 30,
+ .page_base = 0,
+ .parser = parse_ddg_lite},
+ {.name = "Startpage",
+ .base_url = "https://www.startpage.com/sp/search?q=",
+ .host_header = "www.startpage.com",
+ .referer = "https://www.startpage.com/",
+ .page_param = "page",
+ .page_multiplier = 1,
+ .page_base = 1,
+ .parser = parse_startpage},
+ {.name = "Yahoo",
+ .base_url = "https://search.yahoo.com/search?p=",
+ .host_header = "search.yahoo.com",
+ .referer = "https://search.yahoo.com/",
+ .page_param = "b",
+ .page_multiplier = 10,
+ .page_base = 1,
+ .parser = parse_yahoo}};
+
+/* Number of entries in ENGINE_REGISTRY. */
+const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);