diff options
| author | frosty <frosty@illegalfirearms.store> | 2026-01-06 23:46:24 -0500 |
|---|---|---|
| committer | frosty <frosty@illegalfirearms.store> | 2026-01-06 23:46:24 -0500 |
| commit | f3aa7ca0bc2ef7c286609e8f87d07cc2568093af (patch) | |
| tree | 269352af1238b4dd7c3e2e023f71a27b858cdb34 /src/Scraping | |
rebase(d)
Diffstat (limited to 'src/Scraping')
| -rw-r--r-- | src/Scraping/Scraping.c | 468 | ||||
| -rw-r--r-- | src/Scraping/Scraping.h | 34 |
2 files changed, 502 insertions, 0 deletions
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c new file mode 100644 index 0000000..d2afea6 --- /dev/null +++ b/src/Scraping/Scraping.c @@ -0,0 +1,468 @@ +#include "Scraping.h" +#include <curl/curl.h> +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +typedef struct { + char *memory; + size_t size; +} MemoryBuffer; + +static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + MemoryBuffer *mem = (MemoryBuffer *)userp; + + char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1); + if (ptr == NULL) { + LOG_ERROR("Not enough memory (realloc returned NULL)"); + return 0; + } + + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + + return realsize; +} + +static const char *get_random_user_agent() { + static const char *agents[] = { + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " + "Gecko) " + "Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " + "Firefox/121.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"}; + return agents[rand() % 5]; +} + +static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + LOG_DEBUG("[%s] Starting XPath parsing...", engine_name); + int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + LOG_ERROR("[%s] Could not create XPath context", engine_name); + return 0; + } + + const char *link_xpath = "//a[@class='result-link']"; + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { + LOG_WARN("[%s] No results found with XPath: %s", engine_name, link_xpath); + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + int num_links = xpathObj->nodesetval->nodeNr; + LOG_INFO("[%s] XPath matched %d potential result links", engine_name, + num_links); + + int actual_alloc = (num_links < max_results) ? num_links : max_results; + *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + if (!*out_results) { + LOG_ERROR("[%s] Failed to allocate memory for results", engine_name); + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + for (int i = 0; i < num_links && found_count < max_results; i++) { + xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; + char *title = (char *)xmlNodeGetContent(linkNode); + char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); + char *snippet_text = NULL; + + xmlNodePtr current = linkNode->parent; + while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) + current = current->parent; + + if (current && current->next) { + xmlNodePtr snippetRow = current->next; + while (snippetRow && + xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) + snippetRow = snippetRow->next; + if (snippetRow) { + xmlXPathContextPtr subCtx = xmlXPathNewContext(doc); + if (subCtx) { + subCtx->node = snippetRow; + xmlXPathObjectPtr sObj = xmlXPathEvalExpression( + (xmlChar *)".//td[@class='result-snippet']", subCtx); + if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { + snippet_text = + (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]); + } + if (sObj) xmlXPathFreeObject(sObj); + xmlXPathFreeContext(subCtx); + } + } + } + + (*out_results)[found_count].url = strdup(url ? url : ""); + (*out_results)[found_count].title = strdup(title ? title : "No Title"); + (*out_results)[found_count].snippet = + strdup(snippet_text ? snippet_text : ""); + + LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, + (*out_results)[found_count].title); + found_count++; + + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + } + + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return found_count; +} + +static int parse_startpage(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + LOG_DEBUG("[%s] Starting XPath parsing...", engine_name); + int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + LOG_ERROR("[%s] Could not create XPath context", engine_name); + return 0; + } + + const char *container_xpath = "//div[contains(@class, 'result')]"; + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { + LOG_WARN("[%s] No result containers found with XPath: %s", engine_name, + container_xpath); + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + int num_results = xpathObj->nodesetval->nodeNr; + LOG_INFO("[%s] Found %d result containers", engine_name, num_results); + + int actual_alloc = (num_results < max_results) ? num_results : max_results; + *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + if (!*out_results) { + LOG_ERROR("[%s] Failed to allocate memory for results", engine_name); + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + for (int i = 0; i < num_results && found_count < max_results; i++) { + xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; + xmlXPathContextPtr resCtx = xmlXPathNewContext(doc); + if (!resCtx) { + LOG_ERROR("[%s] Failed to create result context for item %d", engine_name, + i); + continue; + } + resCtx->node = resultNode; + + xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( + (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx); + char *url = + (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( + (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx); + char *title = + (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( + (xmlChar *)".//p[contains(@class, 'description')]", resCtx); + char *snippet_text = + (snippetObj && snippetObj->nodesetval && + snippetObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + : NULL; + + if (url && title) { + (*out_results)[found_count].url = strdup(url); + (*out_results)[found_count].title = strdup(title); + (*out_results)[found_count].snippet = + strdup(snippet_text ? snippet_text : ""); + LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, + title); + found_count++; + } else { + LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s", + engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No"); + } + + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + if (linkObj) xmlXPathFreeObject(linkObj); + if (titleObj) xmlXPathFreeObject(titleObj); + if (snippetObj) xmlXPathFreeObject(snippetObj); + xmlXPathFreeContext(resCtx); + } + + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return found_count; +} + +static int parse_yahoo(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + LOG_DEBUG("[%s] Starting XPath parsing...", engine_name); + int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + LOG_ERROR("[%s] Could not create XPath context", engine_name); + return 0; + } + + const char *container_xpath = "//div[contains(@class, 'algo-sr')]"; + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { + LOG_WARN("[%s] No result containers found with XPath: %s", engine_name, + container_xpath); + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + int num_results = xpathObj->nodesetval->nodeNr; + LOG_INFO("[%s] Found %d result containers", engine_name, num_results); + + int actual_alloc = (num_results < max_results) ? num_results : max_results; + *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + if (!*out_results) { + LOG_ERROR("[%s] Failed to allocate memory for results", engine_name); + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + for (int i = 0; i < num_results && found_count < max_results; i++) { + xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; + xmlXPathContextPtr resCtx = xmlXPathNewContext(doc); + if (!resCtx) { + LOG_ERROR("[%s] Failed to create result context for item %d", engine_name, + i); + continue; + } + resCtx->node = resultNode; + + xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( + (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']", + resCtx); + char *url = + (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr titleObj = xmlXPathEvalExpression( + (xmlChar *)".//h3[contains(@class, 'title')]", resCtx); + char *title = + (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression( + (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx); + char *snippet_text = + (snippetObj && snippetObj->nodesetval && + snippetObj->nodesetval->nodeNr > 0) + ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0]) + : NULL; + + if (!url || !title) { + LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i, + url ? url : "(null)", title ? title : "(null)"); + } + + if (url && title) { + (*out_results)[found_count].url = strdup(url); + (*out_results)[found_count].title = strdup(title); + (*out_results)[found_count].snippet = + strdup(snippet_text ? snippet_text : ""); + LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1, + title); + found_count++; + } else { + LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s", + engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No"); + } + + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + if (linkObj) xmlXPathFreeObject(linkObj); + if (titleObj) xmlXPathFreeObject(titleObj); + if (snippetObj) xmlXPathFreeObject(snippetObj); + xmlXPathFreeContext(resCtx); + } + + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return found_count; +} + +const SearchEngine ENGINE_REGISTRY[] = { + {.name = "DuckDuckGo Lite", + .base_url = "https://lite.duckduckgo.com/lite/?q=", + .host_header = "lite.duckduckgo.com", + .referer = "https://lite.duckduckgo.com/", + .parser = parse_ddg_lite}, + {.name = "Startpage", + .base_url = "https://www.startpage.com/sp/search?q=", + .host_header = "www.startpage.com", + .referer = "https://www.startpage.com/", + .parser = parse_startpage}, + {.name = "Yahoo", + .base_url = "https://search.yahoo.com/search?p=", + .host_header = "search.yahoo.com", + .referer = "https://search.yahoo.com/", + .parser = parse_yahoo}}; + +const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); + +int scrape_engine(const SearchEngine *engine, const char *query, + SearchResult **out_results, int max_results) { + CURL *curl; + MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0}; + int results_count = 0; + + LOG_INFO("--- Starting scrape for engine: %s ---", engine->name); + LOG_INFO("[%s] Query: '%s'", engine->name, query); + + if (!chunk.memory) { + LOG_ERROR("Initial memory allocation failed"); + return -1; + } + + curl = curl_easy_init(); + + if (curl && query) { + char full_url[1024]; + char *encoded_query = curl_easy_escape(curl, query, 0); + if (!encoded_query) { + LOG_ERROR("[%s] Failed to encode query", engine->name); + curl_easy_cleanup(curl); + free(chunk.memory); + return -1; + } + snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url, + encoded_query); + curl_free(encoded_query); + + LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url); + + struct curl_slist *headers = NULL; + char host_buf[256], ref_buf[256]; + snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header); + snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer); + + headers = curl_slist_append(headers, host_buf); + headers = curl_slist_append(headers, ref_buf); + headers = curl_slist_append(headers, + "Accept: " + "text/html,application/xhtml+xml,application/" + "xml;q=0.9,image/avif,image/webp,*/*;q=0.8"); + headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); + headers = curl_slist_append(headers, "DNT: 1"); + headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1"); + headers = curl_slist_append(headers, "Sec-Fetch-Dest: document"); + headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate"); + headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin"); + headers = curl_slist_append(headers, "Connection: keep-alive"); + + curl_easy_setopt(curl, CURLOPT_URL, full_url); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk); + + const char *ua = get_random_user_agent(); + LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua); + curl_easy_setopt(curl, CURLOPT_USERAGENT, ua); + + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); + + curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); + + LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name); + usleep(500000 + (rand() % 1000000)); + + CURLcode res = curl_easy_perform(curl); + + if (res != CURLE_OK) { + LOG_ERROR("[%s] libcurl error: %s", engine->name, + curl_easy_strerror(res)); + } else { + long response_code; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); + LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code); + + if (chunk.size > 0) { + xmlDocPtr doc = htmlReadMemory( + chunk.memory, chunk.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + if (doc) { + results_count = + engine->parser(engine->name, doc, out_results, max_results); + xmlFreeDoc(doc); + } + } + } + + if (results_count <= 0) { + LOG_WARN("[%s] No results found. Generating skeleton fallback.", + engine->name); + *out_results = (SearchResult *)malloc(sizeof(SearchResult)); + if (*out_results) { + char fallback_msg[512]; + snprintf(fallback_msg, sizeof(fallback_msg), + "Search %s manually for '%s'", engine->name, query); + + (*out_results)[0].title = strdup(fallback_msg); + (*out_results)[0].url = strdup(full_url); + (*out_results)[0].snippet = strdup( + "Automated results were blocked by a Captcha or anti-bot " + "challenge. Click the link above to perform the search " + "manually in your browser."); + results_count = 1; + } + } + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + } else { + if (curl) { + curl_easy_cleanup(curl); + } + } + + free(chunk.memory); + + return results_count; +} diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h new file mode 100644 index 0000000..7ad4d59 --- /dev/null +++ b/src/Scraping/Scraping.h @@ -0,0 +1,34 @@ +#ifndef SCRAPING_H +#define SCRAPING_H + +#include <libxml/HTMLparser.h> + +#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__) +#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__) +#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__) +#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__) + +typedef struct { + char *url; + char *title; + char *snippet; +} SearchResult; + +typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results); + +typedef struct { + const char *name; + const char *base_url; + const char *host_header; + const char *referer; + ParserFunc parser; +} SearchEngine; + +extern const SearchEngine ENGINE_REGISTRY[]; +extern const int ENGINE_COUNT; + +int scrape_engine(const SearchEngine *engine, const char *query, + SearchResult **out_results, int max_results); + +#endif |
