FIX #2617 Cheerio Web Crawler doesn't work with large sites (#2678)

* FIX #2617 Big sites scan error

* FIX #2617 Big sites scan error - review fix

---------

Co-authored-by: Ahmed Osman <ahmed.osman@evolpe.pl>
pull/2765/head
Ahmed Osman 2024-07-05 12:34:47 +02:00 committed by GitHub
parent b1e38783e4
commit 90558ca688
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 6 additions and 1 deletion

View File

@@ -131,7 +131,11 @@ class Cheerio_DocumentLoaders implements INode {
async function cheerioLoader(url: string): Promise<any> {
try {
let docs = []
let docs: IDocument[] = []
if (url.endsWith('.pdf')) {
if (process.env.DEBUG === 'true') options.logger.info(`CheerioWebBaseLoader does not support PDF files: ${url}`)
return docs
}
const loader = new CheerioWebBaseLoader(url, params)
if (textSplitter) {
docs = await loader.loadAndSplit(textSplitter)
@@ -141,6 +145,7 @@ class Cheerio_DocumentLoaders implements INode {
return docs
} catch (err) {
if (process.env.DEBUG === 'true') options.logger.error(`error in CheerioWebBaseLoader: ${err.message}, on page: ${url}`)
return []
}
}