diff --git a/packages/components/credentials/OxylabsApi.credential.ts b/packages/components/credentials/OxylabsApi.credential.ts
new file mode 100644
index 00000000000..4ecce3c8e32
--- /dev/null
+++ b/packages/components/credentials/OxylabsApi.credential.ts
@@ -0,0 +1,35 @@
+import { INodeParams, INodeCredential } from '../src/Interface'
+
+/**
+ * Credential definition holding the username/password pair that the Oxylabs
+ * document loader sends as HTTP Basic authentication.
+ */
+class OxylabsApiCredential implements INodeCredential {
+    label: string
+    name: string
+    version: number
+    description: string
+    inputs: INodeParams[]
+
+    constructor() {
+        this.label = 'Oxylabs API'
+        this.name = 'oxylabsApi'
+        this.version = 1.0
+        // NOTE(review): placeholder text - replace with a real description before release.
+        this.description = 'Oxylabs API credentials description, to add more info'
+        this.inputs = [
+            {
+                label: 'Oxylabs Username',
+                name: 'username',
+                type: 'string'
+            },
+            {
+                label: 'Oxylabs Password',
+                name: 'password',
+                type: 'password'
+            }
+        ]
+    }
+}
+
+module.exports = { credClass: OxylabsApiCredential }
diff --git a/packages/components/nodes/documentloaders/Oxylabs/Oxylabs.ts b/packages/components/nodes/documentloaders/Oxylabs/Oxylabs.ts
new file mode 100644
index 00000000000..a2af08f459e
--- /dev/null
+++ b/packages/components/nodes/documentloaders/Oxylabs/Oxylabs.ts
@@ -0,0 +1,394 @@
+import { TextSplitter } from 'langchain/text_splitter'
+import { DocumentInterface } from '@langchain/core/documents'
+import { BaseDocumentLoader } from 'langchain/document_loaders/base'
+import { INode, INodeData, INodeParams, ICommonObject, INodeOutputsValue } from '../../../src/Interface'
+import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
+import axios, { AxiosResponse } from 'axios'
+
+/** Oxylabs realtime endpoint every request is posted to. */
+const OXYLABS_REALTIME_URL = 'https://realtime.oxylabs.io/v1/queries'
+
+/** Sources that are addressed with a `query`; every other source is sent as `universal` with a `url`. */
+const QUERY_SOURCES = ['google_search', 'amazon_product', 'amazon_search']
+
+interface OxylabsDocument extends DocumentInterface {}
+
+/** Body of a successful response from the Oxylabs realtime endpoint. */
+interface OxylabsResponse {
+    results: Result[]
+    job: Job
+}
+
+/** One entry of `OxylabsResponse.results`. */
+interface Result {
+    content: any
+    created_at: string
+    updated_at: string
+    page: number
+    url: string
+    job_id: string
+    is_render_forced: boolean
+    status_code: number
+    parser_type: string
+}
+
+/** Job descriptor found under `OxylabsResponse.job`. */
+interface Job {
+    callback_url: string
+    client_id: number
+    context: any
+    created_at: string
+    domain: string
+    geo_location: any
+    id: string
+    limit: number
+    locale: any
+    pages: number
+    parse: boolean
+    parser_type: any
+    parser_preset: any
+    parsing_instructions: any
+    browser_instructions: any
+    render: any
+    url: any
+    query: string
+    source: string
+    start_page: number
+    status: string
+    storage_type: any
+    storage_url: any
+    subdomain: string
+    content_encoding: string
+    updated_at: string
+    user_agent_type: string
+    is_premium_domain: boolean
+}
+
+/** Settings gathered from the node inputs and the selected credential. */
+interface OxylabsLoaderParameters {
+    username: string
+    password: string
+    query: string
+    source: string
+    geo_location: string
+    render: boolean
+    parse: boolean
+    user_agent_type: string
+}
+
+/**
+ * Converts the `content` of a result into document text.
+ * Strings pass through unchanged, missing content becomes an empty string and
+ * anything else (e.g. parsed, structured content) is serialised as JSON.
+ */
+function toPageContent(content: unknown): string {
+    if (typeof content === 'string') return content
+    return content == null ? '' : JSON.stringify(content)
+}
+
+/**
+ * Document loader that posts one query to the Oxylabs realtime endpoint and
+ * turns the returned results into documents.
+ */
+export class OxylabsLoader extends BaseDocumentLoader {
+    private params: OxylabsLoaderParameters
+
+    constructor(loaderParams: OxylabsLoaderParameters) {
+        super()
+        this.params = loaderParams
+    }
+
+    /**
+     * Builds the request body for the configured source.
+     * Sources listed in `QUERY_SOURCES` are addressed with `query`; any other
+     * value falls back to the `universal` source, which takes a `url`.
+     */
+    private buildPayload(): Record<string, unknown> {
+        const { query, source, geo_location, render, parse, user_agent_type } = this.params
+        const target = QUERY_SOURCES.includes(source) ? { query, source } : { url: query, source: 'universal' }
+
+        return {
+            ...target,
+            geo_location,
+            // Oxylabs takes the render mode as a string ('html'), not a boolean flag.
+            render: render ? 'html' : undefined,
+            parse,
+            user_agent_type
+        }
+    }
+
+    /**
+     * Posts `params` to the Oxylabs realtime endpoint with HTTP Basic authentication.
+     * Entries whose value is null, undefined or an empty string are dropped first.
+     * @throws Error when the endpoint answers with a status of 400 or above.
+     */
+    private async sendAPIRequest(params: any): Promise<AxiosResponse<OxylabsResponse>> {
+        const payload = Object.fromEntries(
+            Object.entries(params).filter(([_, value]) => value !== null && value !== '' && value !== undefined)
+        )
+
+        // Buffer encodes the pair as UTF-8; btoa throws on characters outside Latin-1.
+        const auth = Buffer.from(`${this.params.username}:${this.params.password}`, 'utf8').toString('base64')
+
+        const response = await axios.post<OxylabsResponse>(OXYLABS_REALTIME_URL, payload, {
+            headers: {
+                'Content-Type': 'application/json',
+                'x-oxylabs-sdk': 'oxylabs-integration-flowise/1.0.0 (1.0.0; 64bit)',
+                Authorization: `Basic ${auth}`
+            },
+            // axios rejects non-2xx responses by default, which made the status check
+            // below unreachable; accept every status so the error is reported here.
+            validateStatus: () => true
+        })
+
+        if (response.status >= 400) {
+            throw new Error(`Oxylabs: Failed to call Oxylabs API: ${response.status}`)
+        }
+
+        return response
+    }
+
+    /**
+     * Runs the configured query and maps every returned result to a document.
+     * @returns One document per result; an empty array when there are no results.
+     */
+    public async load(): Promise<OxylabsDocument[]> {
+        const response = await this.sendAPIRequest(this.buildPayload())
+        const results = Array.isArray(response.data?.results) ? response.data.results : []
+        const jobId = String(response.data?.job?.id ?? '')
+
+        return results.map((result, index) => ({
+            // A single result keeps the plain job id; several get an index suffix.
+            id: results.length > 1 ? `${jobId}-${index}` : jobId,
+            pageContent: toPageContent(result.content),
+            metadata: {}
+        }))
+    }
+}
+
+/** Flowise document-loader node that exposes `OxylabsLoader` with its credential, inputs and outputs. */
+class Oxylabs_DocumentLoaders implements INode {
+    label: string
+    name: string
+    description: string
+    type: string
+    icon: string
+    version: number
+    category: string
+    baseClasses: string[]
+    inputs: INodeParams[]
+    credential: INodeParams
+    outputs: INodeOutputsValue[]
+
+    constructor() {
+        this.label = 'Oxylabs'
+        this.name = 'oxylabs'
+        this.type = 'Document'
+        this.icon = 'oxylabs.svg'
+        this.version = 1.0
+        this.category = 'Document Loaders'
+        this.description = 'Extract data from URLs using Oxylabs'
+        this.baseClasses = [this.type]
+        this.credential = {
+            label: 'Oxylabs API',
+            name: 'credential',
+            type: 'credential',
+            credentialNames: ['oxylabsApi']
+        }
+        this.inputs = [
+            {
+                label: 'Text Splitter',
+                name: 'textSplitter',
+                type: 'TextSplitter',
+                // init() only splits when a splitter is connected, so it is not mandatory.
+                optional: true
+            },
+            {
+                label: 'Query',
+                name: 'query',
+                type: 'string',
+                description: 'Website URL or query keyword.'
+            },
+            {
+                label: 'Source',
+                name: 'source',
+                type: 'options',
+                description: 'Target website to scrape.',
+                options: [
+                    {
+                        label: 'Universal',
+                        name: 'universal'
+                    },
+                    {
+                        label: 'Google Search',
+                        name: 'google_search'
+                    },
+                    {
+                        label: 'Amazon Product',
+                        name: 'amazon_product'
+                    },
+                    {
+                        label: 'Amazon Search',
+                        name: 'amazon_search'
+                    }
+                ],
+                default: 'universal'
+            },
+            {
+                label: 'Geolocation',
+                name: 'geo_location',
+                type: 'string',
+                description:
+                    "Sets the proxy's geo location to retrieve data. Check Oxylabs documentation for more details.",
+                optional: true
+            },
+            {
+                label: 'Render',
+                name: 'render',
+                type: 'boolean',
+                description: 'Enables JavaScript rendering when set to true.',
+                optional: true,
+                default: false
+            },
+            {
+                label: 'Parse',
+                name: 'parse',
+                type: 'boolean',
+                description:
+                    "Returns parsed data when set to true, as long as a dedicated parser exists for the submitted URL's page type.",
+                optional: true,
+                default: false
+            },
+            {
+                label: 'User Agent Type',
+                name: 'user_agent_type',
+                type: 'options',
+                description: 'Device type and browser.',
+                options: [
+                    {
+                        label: 'Desktop',
+                        name: 'desktop'
+                    },
+                    {
+                        label: 'Desktop Chrome',
+                        name: 'desktop_chrome'
+                    },
+                    {
+                        label: 'Desktop Edge',
+                        name: 'desktop_edge'
+                    },
+                    {
+                        label: 'Desktop Firefox',
+                        name: 'desktop_firefox'
+                    },
+                    {
+                        label: 'Desktop Opera',
+                        name: 'desktop_opera'
+                    },
+                    {
+                        label: 'Desktop Safari',
+                        name: 'desktop_safari'
+                    },
+                    {
+                        label: 'Mobile',
+                        name: 'mobile'
+                    },
+                    {
+                        label: 'Mobile Android',
+                        name: 'mobile_android'
+                    },
+                    {
+                        label: 'Mobile iOS',
+                        name: 'mobile_ios'
+                    },
+                    {
+                        label: 'Tablet',
+                        name: 'tablet'
+                    },
+                    {
+                        label: 'Tablet Android',
+                        name: 'tablet_android'
+                    },
+                    {
+                        label: 'Tablet iOS',
+                        name: 'tablet_ios'
+                    }
+                ],
+                optional: true
+            }
+        ]
+        this.outputs = [
+            {
+                label: 'Document',
+                name: 'document',
+                description: 'Array of document objects containing metadata and pageContent',
+                baseClasses: [...this.baseClasses, 'json']
+            },
+            {
+                label: 'Text',
+                name: 'text',
+                description: 'Concatenated string from pageContent of documents',
+                baseClasses: ['string', 'json']
+            }
+        ]
+    }
+
+    /**
+     * Scrapes the configured query through Oxylabs and returns the result in the
+     * shape of the selected output.
+     * @returns The documents when the `document` output is selected, otherwise the
+     * page contents joined into one string, each followed by a newline.
+     * @throws Error when the query or the credential's username/password is missing.
+     */
+    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
+        const query = nodeData.inputs?.query as string
+        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
+        const source = nodeData.inputs?.source as string
+        const geo_location = nodeData.inputs?.geo_location as string
+        const render = nodeData.inputs?.render as boolean
+        const parse = nodeData.inputs?.parse as boolean
+        const user_agent_type = nodeData.inputs?.user_agent_type as string
+        const output = nodeData.outputs?.output as string
+
+        // Fail before any network call when the request cannot be built.
+        if (!query) {
+            throw new Error('Oxylabs: Query is required')
+        }
+
+        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
+        const username = getCredentialParam('username', credentialData, nodeData)
+        const password = getCredentialParam('password', credentialData, nodeData)
+        if (!username || !password) {
+            throw new Error('Oxylabs: Username and password are required')
+        }
+
+        const loader = new OxylabsLoader({
+            username,
+            password,
+            query,
+            source,
+            geo_location,
+            render,
+            parse,
+            user_agent_type
+        })
+
+        let docs: OxylabsDocument[] = await loader.load()
+
+        if (textSplitter && docs.length > 0) {
+            docs = await textSplitter.splitDocuments(docs)
+        }
+
+        if (output === 'document') {
+            return docs
+        }
+
+        const finaltext = docs.map((doc) => `${doc.pageContent}\n`).join('')
+        return handleEscapeCharacters(finaltext, false)
+    }
+}
+
+module.exports = { nodeClass: Oxylabs_DocumentLoaders }
+
+// FOR TESTING PURPOSES
+// export { Oxylabs_DocumentLoaders }
diff --git a/packages/components/nodes/documentloaders/Oxylabs/oxylabs.svg b/packages/components/nodes/documentloaders/Oxylabs/oxylabs.svg
new file mode 100644 index 00000000000..2ddd10b01a3 --- /dev/null +++ b/packages/components/nodes/documentloaders/Oxylabs/oxylabs.svg @@ -0,0 +1,4 @@ + + + +