From 90558ca688298ccba83b18d0b41c5d1a89e10916 Mon Sep 17 00:00:00 2001 From: Ahmed Osman Date: Fri, 5 Jul 2024 12:34:47 +0200 Subject: [PATCH] FIX #2617 Cheerio Web Crawler doesn't work with large sites (#2678) * FIX #2617 Big sites scan error * FIX #2617 Big sites scan error - review fix --------- Co-authored-by: Ahmed Osman --- .../components/nodes/documentloaders/Cheerio/Cheerio.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts index 966845b6..c17b539b 100644 --- a/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts +++ b/packages/components/nodes/documentloaders/Cheerio/Cheerio.ts @@ -131,7 +131,11 @@ class Cheerio_DocumentLoaders implements INode { async function cheerioLoader(url: string): Promise { try { - let docs = [] + let docs: IDocument[] = [] + if (url.endsWith('.pdf')) { + if (process.env.DEBUG === 'true') options.logger.info(`CheerioWebBaseLoader does not support PDF files: ${url}`) + return docs + } const loader = new CheerioWebBaseLoader(url, params) if (textSplitter) { docs = await loader.loadAndSplit(textSplitter) @@ -141,6 +145,7 @@ class Cheerio_DocumentLoaders implements INode { return docs } catch (err) { if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`) + return [] } }