diff options
Diffstat (limited to 'src/Scraping/ScrapingParsers.c')
| -rw-r--r-- | src/Scraping/ScrapingParsers.c | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/src/Scraping/ScrapingParsers.c b/src/Scraping/ScrapingParsers.c new file mode 100644 index 0000000..818d333 --- /dev/null +++ b/src/Scraping/ScrapingParsers.c @@ -0,0 +1,269 @@ +#include "../Utility/Unescape.h" +#include "../Utility/XmlHelper.h" +#include "Config.h" +#include "Scraping.h" +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdlib.h> +#include <string.h> + +xmlXPathContextPtr create_xpath_context(xmlDocPtr doc) { + return xmlXPathNewContext(doc); +} + +void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj) { + if (obj) + xmlXPathFreeObject(obj); + if (ctx) + xmlXPathFreeContext(ctx); +} + +SearchResult *alloc_results_array(int capacity, int max_results) { + int count = capacity < max_results ? capacity : max_results; + return xml_result_alloc(capacity, count); +} + +void assign_result(SearchResult *result, char *url, char *title, char *snippet, + int unescape) { + result->url = unescape ? unescape_search_url(url) : strdup(url ? url : ""); + result->title = strdup(title ? title : "No Title"); + result->snippet = strdup(snippet ? snippet : ""); +} + +void free_xml_node_list(char *title, char *url, char *snippet) { + if (title) + xmlFree(title); + if (url) + xmlFree(url); + if (snippet) + xmlFree(snippet); +} + +static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + (void)engine_name; + int found_count = 0; + + xmlXPathContextPtr ctx = create_xpath_context(doc); + if (!ctx) + return 0; + + xmlXPathObjectPtr obj = + xml_xpath_eval(ctx, "//tr[not(contains(@class, " + "'result-sponsored'))]//a[@class='result-link']"); + + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + free_xpath_objects(ctx, obj); + return 0; + } + + int num_links = obj->nodesetval->nodeNr; + *out_results = alloc_results_array(num_links, max_results); + if (!*out_results) { + free_xpath_objects(ctx, obj); + return 0; + } + + for (int i = 0; i < num_links && found_count < max_results; i++) { + xmlNodePtr link_node = obj->nodesetval->nodeTab[i]; + char *title = xml_node_content(link_node); + char *url = (char *)xmlGetProp(link_node, (xmlChar *)"href"); + char *snippet_text = NULL; + + xmlNodePtr current = link_node->parent; + while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) + current = current->parent; + + if (current && current->next) { + xmlNodePtr snippet_row = current->next; + while (snippet_row && + xmlStrcasecmp(snippet_row->name, (const xmlChar *)"tr") != 0) + snippet_row = snippet_row->next; + if (snippet_row) { + ctx->node = snippet_row; + xmlXPathObjectPtr s_obj = + xml_xpath_eval(ctx, ".//td[@class='result-snippet']"); + if (s_obj && s_obj->nodesetval && s_obj->nodesetval->nodeNr > 0) + snippet_text = xml_node_content(s_obj->nodesetval->nodeTab[0]); + if (s_obj) + xmlXPathFreeObject(s_obj); + ctx->node = NULL; + } + } + + assign_result(&(*out_results)[found_count], url, title, snippet_text, 1); + free_xml_node_list(title, url, snippet_text); + found_count++; + } + + free_xpath_objects(ctx, obj); + return found_count; +} + +static int parse_startpage(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + (void)engine_name; + int found_count = 0; + + xmlXPathContextPtr ctx = create_xpath_context(doc); + if (!ctx) + return 0; + + xmlXPathObjectPtr obj = + xml_xpath_eval(ctx, "//div[contains(@class, 'result')]"); + + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + free_xpath_objects(ctx, obj); + return 0; + } + + int num_results = obj->nodesetval->nodeNr; + *out_results = alloc_results_array(num_results, max_results); + if (!*out_results) { + free_xpath_objects(ctx, obj); + return 0; + } + + for (int i = 0; i < num_results && found_count < max_results; i++) { + xmlNodePtr result_node = obj->nodesetval->nodeTab[i]; + ctx->node = result_node; + + xmlXPathObjectPtr link_obj = + xml_xpath_eval(ctx, ".//a[contains(@class, 'result-link')]"); + char *url = + (link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr title_obj = + xml_xpath_eval(ctx, ".//h2[contains(@class, 'wgl-title')]"); + char *title = (title_obj && title_obj->nodesetval && + title_obj->nodesetval->nodeNr > 0) + ? xml_node_content(title_obj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippet_obj = + xml_xpath_eval(ctx, ".//p[contains(@class, 'description')]"); + char *snippet_text = + (snippet_obj && snippet_obj->nodesetval && + snippet_obj->nodesetval->nodeNr > 0) + ? xml_node_content(snippet_obj->nodesetval->nodeTab[0]) + : NULL; + + if (url && title) { + assign_result(&(*out_results)[found_count], url, title, snippet_text, 0); + found_count++; + } + + free_xml_node_list(title, url, snippet_text); + if (link_obj) + xmlXPathFreeObject(link_obj); + if (title_obj) + xmlXPathFreeObject(title_obj); + if (snippet_obj) + xmlXPathFreeObject(snippet_obj); + } + + ctx->node = NULL; + free_xpath_objects(ctx, obj); + return found_count; +} + +static int parse_yahoo(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + (void)engine_name; + int found_count = 0; + + xmlXPathContextPtr ctx = create_xpath_context(doc); + if (!ctx) + return 0; + + xmlXPathObjectPtr obj = + xml_xpath_eval(ctx, "//div[contains(@class, 'algo-sr')]"); + + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + free_xpath_objects(ctx, obj); + return 0; + } + + int num_results = obj->nodesetval->nodeNr; + *out_results = alloc_results_array(num_results, max_results); + if (!*out_results) { + free_xpath_objects(ctx, obj); + return 0; + } + + for (int i = 0; i < num_results && found_count < max_results; i++) { + xmlNodePtr result_node = obj->nodesetval->nodeTab[i]; + ctx->node = result_node; + + xmlXPathObjectPtr link_obj = xml_xpath_eval( + ctx, ".//div[contains(@class, 'compTitle')]//a[@target='_blank']"); + char *url = + (link_obj && link_obj->nodesetval && link_obj->nodesetval->nodeNr > 0) + ? (char *)xmlGetProp(link_obj->nodesetval->nodeTab[0], + (xmlChar *)"href") + : NULL; + + xmlXPathObjectPtr title_obj = + xml_xpath_eval(ctx, ".//h3[contains(@class, 'title')]"); + char *title = (title_obj && title_obj->nodesetval && + title_obj->nodesetval->nodeNr > 0) + ? xml_node_content(title_obj->nodesetval->nodeTab[0]) + : NULL; + + xmlXPathObjectPtr snippet_obj = + xml_xpath_eval(ctx, ".//div[contains(@class, 'compText')]//p"); + char *snippet_text = + (snippet_obj && snippet_obj->nodesetval && + snippet_obj->nodesetval->nodeNr > 0) + ? xml_node_content(snippet_obj->nodesetval->nodeTab[0]) + : NULL; + + if (url && title) { + assign_result(&(*out_results)[found_count], url, title, snippet_text, 1); + found_count++; + } + + free_xml_node_list(title, url, snippet_text); + if (link_obj) + xmlXPathFreeObject(link_obj); + if (title_obj) + xmlXPathFreeObject(title_obj); + if (snippet_obj) + xmlXPathFreeObject(snippet_obj); + } + + ctx->node = NULL; + free_xpath_objects(ctx, obj); + return found_count; +} + +const SearchEngine ENGINE_REGISTRY[] = { + {.name = "DuckDuckGo Lite", + .base_url = "https://lite.duckduckgo.com/lite/?q=", + .host_header = "lite.duckduckgo.com", + .referer = "https://lite.duckduckgo.com/", + .page_param = "s", + .page_multiplier = 30, + .page_base = 0, + .parser = parse_ddg_lite}, + {.name = "Startpage", + .base_url = "https://www.startpage.com/sp/search?q=", + .host_header = "www.startpage.com", + .referer = "https://www.startpage.com/", + .page_param = "page", + .page_multiplier = 1, + .page_base = 1, + .parser = parse_startpage}, + {.name = "Yahoo", + .base_url = "https://search.yahoo.com/search?p=", + .host_header = "search.yahoo.com", + .referer = "https://search.yahoo.com/", + .page_param = "b", + .page_multiplier = 10, + .page_base = 1, + .parser = parse_yahoo}}; + +const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine); |
