diff --git a/examples/apify/agentic-synth/.actor/actor.json b/examples/apify/agentic-synth/.actor/actor.json index 41005301..3483781a 100644 --- a/examples/apify/agentic-synth/.actor/actor.json +++ b/examples/apify/agentic-synth/.actor/actor.json @@ -1,11 +1,11 @@ { "actorSpecification": 1, "name": "ai-synthetic-data-generator", - "title": "AI Synthetic Data Generator - Enterprise Mock Data with MCP & Actor Integration", - "description": "Generate unlimited, high-quality synthetic data with TRM/SONA self-learning. NEW: MCP server for AI agent integration + one-click data from Google Maps, Instagram, TikTok, Amazon scrapers. 6 use-case templates (Lead Intelligence, Competitor Monitor, Support Knowledge). Simulate Bloomberg terminals, medical records, supply chains, e-commerce, and more.", - "seoTitle": "AI Synthetic Data Generator - MCP Server & Actor Integration for RAG", - "seoDescription": "Generate synthetic data with MCP server support. One-click integration with Google Maps, Instagram, TikTok, Amazon scrapers. 6 pre-built templates for Lead Intelligence, Competitor Monitor, Support RAG. TRM/SONA self-learning.", - "version": "2.2", + "title": "AI Synthetic Data Generator - ONNX Embeddings, MCP & 21 Actor Integrations", + "description": "Generate unlimited synthetic data with ONNX-powered semantic embeddings (all-MiniLM-L6-v2, bge-small). 21 actor integrations (Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support.", + "seoTitle": "AI Synthetic Data Generator - ONNX Embeddings & 21 Actor Integrations", + "seoDescription": "Generate synthetic data with ONNX semantic embeddings. 21 actor integrations (Google Maps, Reddit, Yelp, Zillow). 12 templates. 
TRM/SONA self-learning.", + "version": "2.5", "buildTag": "latest", "minMemoryMbytes": 256, "maxMemoryMbytes": 4096, diff --git a/examples/apify/agentic-synth/.actor/input_schema.json b/examples/apify/agentic-synth/.actor/input_schema.json index 7381366d..c2e7e540 100644 --- a/examples/apify/agentic-synth/.actor/input_schema.json +++ b/examples/apify/agentic-synth/.actor/input_schema.json @@ -1,6 +1,6 @@ { - "title": "AI Synthetic Data Generator", - "description": "Generate unlimited, high-quality synthetic data for testing scrapers, training AI models, and building agentic applications. 18 data types including e-commerce, Bloomberg terminal, medical records, supply chain, and more. Now with one-click Apify actor integration and use case templates. Powered by TRM/SONA self-learning.", + "title": "AI Synthetic Data Generator v2.5", + "description": "Generate unlimited synthetic data with ONNX-powered semantic embeddings (all-MiniLM-L6-v2, bge-small). 21 actor integrations (Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support.", "type": "object", "schemaVersion": 1, "properties": { @@ -58,7 +58,7 @@ "integrateActorId": { "title": "Apify Actor to Integrate", "type": "string", - "description": "Actor ID to pull data from (e.g., apify/google-maps-scraper). Used in 'integrate' mode.", + "description": "Actor ID to pull data from. 21 actors supported. 
Used in 'integrate' mode.", "editor": "select", "enum": [ "apify/google-maps-scraper", @@ -73,7 +73,15 @@ "apify/website-content-crawler", "apify/cheerio-scraper", "apify/news-scraper", - "apify/linkedin-scraper" + "apify/linkedin-scraper", + "trudax/tripadvisor-scraper", + "maxcopell/yelp-scraper", + "trudax/booking-scraper", + "petr_cermak/zillow-scraper", + "epctex/craigslist-scraper", + "apify/reddit-scraper", + "apify/facebook-posts-scraper", + "compass/google-places-api" ], "enumTitles": [ "Google Maps Scraper - Local business data", @@ -88,10 +96,18 @@ "Website Content Crawler - Full site content", "Cheerio Scraper - Structured extraction", "News Scraper - News articles", - "LinkedIn Scraper - Jobs, profiles" + "LinkedIn Scraper - Jobs, profiles", + "TripAdvisor Scraper - Reviews, restaurants, hotels", + "Yelp Scraper - Business reviews, ratings", + "Booking.com Scraper - Hotels, accommodations", + "Zillow Scraper - Real estate listings", + "Craigslist Scraper - Classifieds, listings", + "Reddit Scraper - Posts, comments, subreddits", + "Facebook Posts Scraper - Posts, engagement", + "Google Places API - Places, reviews" ], - "sectionCaption": "Actor Integration", - "sectionDescription": "One-click integration with top Apify actors for RAG/memory use cases" + "sectionCaption": "Actor Integration (21 Actors)", + "sectionDescription": "One-click integration with 21 top Apify actors for RAG/memory use cases" }, "integrateRunId": { "title": "Run ID", @@ -116,18 +132,24 @@ "useTemplate": { "title": "Use Case Template", "type": "string", - "description": "Pre-built template for common use cases. Used in 'template' mode.", + "description": "Pre-built template for common use cases. 12 templates available. 
Used in 'template' mode.", "editor": "select", - "enum": ["lead-intelligence", "competitor-monitor", "support-knowledge", "research-assistant", "content-library", "product-catalog"], + "enum": ["lead-intelligence", "competitor-monitor", "support-knowledge", "research-assistant", "content-library", "product-catalog", "review-aggregator", "price-tracker", "social-listening", "talent-sourcing", "real-estate-intel", "travel-planner"], "enumTitles": [ "Lead Intelligence - Sales teams memorizing prospect data", "Competitor Monitor - Track competitor mentions/changes", "Support Knowledge - Customer support RAG system", "Research Assistant - Academic/market research", "Content Library - Content creators' reference", - "Product Catalog - E-commerce product memory" + "Product Catalog - E-commerce product memory", + "Review Aggregator - Aggregate reviews from multiple platforms", + "Price Tracker - Monitor prices for competitive intelligence", + "Social Listening - Monitor social conversations and trends", + "Talent Sourcing - Recruit candidates from job platforms", + "Real Estate Intel - Property market analysis", + "Travel Planner - Hotels, restaurants, activities data" ], - "sectionCaption": "Use Case Templates", + "sectionCaption": "Use Case Templates (12 Templates)", "sectionDescription": "Pre-configured templates for common data workflows" }, "schema": { @@ -183,7 +205,30 @@ "title": "Generate Embeddings", "type": "boolean", "description": "Generate vector embeddings for all output records (useful for RAG systems)", - "default": false + "default": false, + "sectionCaption": "ONNX Embeddings", + "sectionDescription": "Semantic embeddings powered by ONNX Runtime (Hugging Face Transformers.js)" + }, + "useOnnxEmbeddings": { + "title": "Use ONNX Semantic Embeddings", + "type": "boolean", + "description": "Use real semantic embeddings via ONNX (slower but more accurate) vs random vectors (fast for testing)", + "default": true + }, + "embeddingModel": { + "title": "ONNX 
Embedding Model", + "type": "string", + "description": "Choose embedding model. Smaller models are faster, larger models are more accurate.", + "editor": "select", + "default": "all-MiniLM-L6-v2", + "enum": ["all-MiniLM-L6-v2", "bge-small-en-v1.5", "all-mpnet-base-v2", "e5-small-v2", "gte-small"], + "enumTitles": [ + "all-MiniLM-L6-v2 (384d, Fast, Good) - Recommended", + "bge-small-en-v1.5 (384d, Fast, Excellent)", + "all-mpnet-base-v2 (768d, Medium, Excellent)", + "e5-small-v2 (384d, Fast, Very Good)", + "gte-small (384d, Fast, Very Good)" + ] }, "simulationMode": { "title": "Long-Running Simulation", diff --git a/examples/apify/agentic-synth/README.md b/examples/apify/agentic-synth/README.md index 99587c52..22c6933b 100644 --- a/examples/apify/agentic-synth/README.md +++ b/examples/apify/agentic-synth/README.md @@ -1,24 +1,24 @@ -# AI Synthetic Data Generator - MCP Server & Actor Integration +# AI Synthetic Data Generator - ONNX Embeddings, MCP & 21 Actor Integrations -**Generate unlimited synthetic data** grounded in real-world patterns. **One-click integration** with 13 popular Apify web scrapers (Google Maps, Instagram, TikTok, Amazon, LinkedIn) lets you transform real scraped data into AI-ready formats for RAG systems, agent memory, and model training. +**Generate unlimited synthetic data** with **ONNX-powered semantic embeddings** (all-MiniLM-L6-v2, bge-small). **One-click integration** with 21 popular Apify web scrapers (Google Maps, Instagram, TikTok, Amazon, LinkedIn, Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support. **Why grounding matters:** Pure synthetic data can drift from reality. By integrating with live Apify scrapers, your synthetic data inherits real naming conventions, price distributions, engagement patterns, and business characteristics - making your AI models and tests far more realistic. 
[![Apify Actor](https://img.shields.io/badge/Apify-Actor-blue)](https://apify.com/ruv/ai-synthetic-data-generator) [![MCP Server](https://img.shields.io/badge/MCP-Server-purple)](https://modelcontextprotocol.io) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) -[![Version](https://img.shields.io/badge/version-2.2-green)](https://github.com/ruvnet/ruvector) +[![Version](https://img.shields.io/badge/version-2.5-green)](https://github.com/ruvnet/ruvector) -## What's New in v2.2 +## What's New in v2.5 | Feature | Description | |---------|-------------| -| **One-Click Scraper Integration** | Ground synthetic data with real patterns from Google Maps, Instagram, TikTok, YouTube, Amazon, LinkedIn, and 7 more scrapers | +| **ONNX Semantic Embeddings** | Real semantic embeddings via Hugging Face Transformers.js (all-MiniLM-L6-v2, bge-small, all-mpnet-base-v2, e5-small, gte-small) | +| **21 Actor Integrations** | Added Reddit, Yelp, TripAdvisor, Zillow, Booking.com, Craigslist, Facebook, Google Places | +| **12 Use Case Templates** | Added Review Aggregator, Price Tracker, Social Listening, Talent Sourcing, Real Estate Intel, Travel Planner | | **MCP Server** | Use as AI agent tool (Claude, GPT) via Model Context Protocol | -| **6 Use Case Templates** | Lead Intelligence, Competitor Monitor, Support RAG, and more | -| **Enhanced Grounding** | Transform scraped data into AI-ready formats that preserve real-world distributions | -| **Webhook Support** | POST results to your endpoint for async workflows | -| **Output Formats** | JSON, JSONL, CSV export options | +| **TRM/SONA Self-Learning** | 7M parameter recursive reasoning with 3-tier learning | +| **Output Formats** | JSON, JSONL, CSV export with webhook support | --- @@ -26,9 +26,10 @@ | Feature | Description | |---------|-------------| -| **Scraper Integration** | One-click grounding with 13 popular Apify scrapers for realistic data | +| **ONNX Embeddings** | 5 
semantic embedding models via Transformers.js for RAG-ready vectors | +| **21 Actor Integrations** | One-click grounding with 21 popular Apify scrapers | +| **12 Templates** | Pre-built workflows for common use cases | | **MCP Server** | Integrate with Claude Code, GPT, and AI agents | -| **6 Templates** | Pre-built workflows for common use cases | | **TRM** | 7M parameter recursive reasoning (83% on GSM8K) | | **SONA** | 3-tier self-learning (Instant/Background/Deep) | | **EWC++** | Pattern preservation across generations (lambda=2000) | @@ -92,7 +93,35 @@ Once connected, you get these tools in your AI agent: --- -## One-Click Actor Integration +## ONNX Semantic Embeddings + +Generate real semantic embeddings for RAG systems using ONNX Runtime and Hugging Face Transformers.js. + +### Available Models + +| Model | Dimensions | Speed | Quality | Use Case | +|-------|------------|-------|---------|----------| +| **all-MiniLM-L6-v2** | 384 | Fast | Good | General purpose (recommended) | +| **bge-small-en-v1.5** | 384 | Fast | Excellent | High-quality retrieval | +| **all-mpnet-base-v2** | 768 | Medium | Excellent | Maximum accuracy | +| **e5-small-v2** | 384 | Fast | Very Good | Balanced performance | +| **gte-small** | 384 | Fast | Very Good | Efficient retrieval | + +### Example: Generate with ONNX Embeddings + +```json +{ + "dataType": "ecommerce", + "count": 100, + "generateEmbeddings": true, + "useOnnxEmbeddings": true, + "embeddingModel": "all-MiniLM-L6-v2" +} +``` + +--- + +## One-Click Actor Integration (21 Actors) Transform data from popular Apify scrapers into AI-ready format. @@ -113,6 +142,14 @@ Transform data from popular Apify scrapers into AI-ready format. 
| **apify/web-scraper** | General | Any web page data | | **apify/cheerio-scraper** | General | Structured extraction | | **apify/news-scraper** | News | Articles, authors, sources | +| **trudax/tripadvisor-scraper** | Reviews | Hotels, restaurants, reviews | +| **maxcopell/yelp-scraper** | Reviews | Business reviews, ratings | +| **trudax/booking-scraper** | Travel | Hotels, accommodations, prices | +| **petr_cermak/zillow-scraper** | Real Estate | Property listings, prices | +| **epctex/craigslist-scraper** | Classifieds | Listings, local posts | +| **apify/reddit-scraper** | Social Media | Posts, comments, subreddits | +| **apify/facebook-posts-scraper** | Social Media | Posts, engagement | +| **compass/google-places-api** | Local Business | Places, reviews, details | ### Example: Integrate Google Maps Data @@ -128,7 +165,7 @@ Transform data from popular Apify scrapers into AI-ready format. --- -## Use Case Templates +## Use Case Templates (12 Templates) One-click deployment for common AI/RAG scenarios. @@ -140,6 +177,12 @@ One-click deployment for common AI/RAG scenarios. 
| **research-assistant** | Academic/market research | Researchers | Google Search, News, Content | | **content-library** | Content creators' reference | Creators | Instagram, TikTok, YouTube | | **product-catalog** | E-commerce product memory | E-commerce | Amazon, Shopify, Google Maps | +| **review-aggregator** | Aggregate reviews from platforms | Product Managers | TripAdvisor, Yelp, Google Maps | +| **price-tracker** | Monitor prices across sites | Pricing Teams | Amazon, Zillow, Booking.com | +| **social-listening** | Monitor social conversations | Social Media, PR | Reddit, Twitter, Facebook | +| **talent-sourcing** | Recruit from job platforms | Recruiters, HR | LinkedIn, Craigslist | +| **real-estate-intel** | Property market analysis | Real Estate, Investors | Zillow, Google Maps, Craigslist | +| **travel-planner** | Hotels, restaurants, activities | Travel Agents | TripAdvisor, Booking.com, Google Maps | ### Example: Lead Intelligence Template diff --git a/examples/apify/agentic-synth/package.json b/examples/apify/agentic-synth/package.json index a828deab..983dcaaf 100644 --- a/examples/apify/agentic-synth/package.json +++ b/examples/apify/agentic-synth/package.json @@ -1,7 +1,7 @@ { "name": "agentic-synth-apify-actor", - "version": "2.2.0", - "description": "AI Synthetic Data Generator with TRM/SONA self-learning, MCP server, and Apify actor integrations", + "version": "2.5.0", + "description": "AI Synthetic Data Generator with ONNX embeddings, TRM/SONA self-learning, MCP server, and 21 Apify actor integrations", "main": "src/main.js", "type": "module", "bin": { @@ -19,6 +19,7 @@ "@google/generative-ai": "^0.24.1", "@modelcontextprotocol/sdk": "^1.0.0", "@ruvector/ruvllm": "^0.2.3", + "@xenova/transformers": "^2.17.2", "apify": "^3.5.2" }, "optionalDependencies": { diff --git a/examples/apify/agentic-synth/src/embeddings.js b/examples/apify/agentic-synth/src/embeddings.js new file mode 100644 index 00000000..b8058994 --- /dev/null +++ 
/**
 * Embed a list of texts, processing them in consecutive batches.
 *
 * Each text is truncated to 8000 characters and passed to the feature-extraction
 * pipeline with mean pooling; the pipeline output's `data` is copied into a plain array.
 *
 * @param {Array<*>} texts - Texts to embed. Falsy entries (null, undefined, '') are
 *   embedded as the empty string; other non-string values are coerced with String().
 * @param {object} [options]
 * @param {string} [options.modelName='all-MiniLM-L6-v2'] - Model key handed to initEmbeddingPipeline.
 * @param {number} [options.batchSize=32] - Number of texts run concurrently per batch (must be >= 1).
 * @param {boolean} [options.normalize=true] - Forwarded to the pipeline, matching generateEmbedding.
 * @param {?function({processed: number, total: number}): void} [options.onProgress=null]
 *   Invoked once after every completed batch.
 * @returns {Promise<number[][]>} One embedding per input text, in input order.
 * @throws {TypeError} If `texts` is not an array.
 * @throws {RangeError} If `batchSize` is not a number >= 1 (previously this looped forever).
 */
export async function generateEmbeddingsBatch(texts, options = {}) {
  const {
    modelName = 'all-MiniLM-L6-v2',
    batchSize = 32,
    normalize = true,
    onProgress = null,
  } = options;

  if (!Array.isArray(texts)) {
    throw new TypeError('generateEmbeddingsBatch: texts must be an array');
  }

  // A step of 0 (or a negative/NaN step) would never advance the loop below.
  const step = Math.floor(Number(batchSize));
  if (!(step >= 1)) {
    throw new RangeError(`generateEmbeddingsBatch: batchSize must be >= 1, got ${batchSize}`);
  }

  // Nothing to embed: skip loading the model altogether.
  if (texts.length === 0) return [];

  const maxChars = 8000;
  const pipe = await initEmbeddingPipeline(modelName);
  const embeddings = [];

  for (let start = 0; start < texts.length; start += step) {
    const batch = texts.slice(start, start + step);
    const batchEmbeddings = await Promise.all(
      batch.map(async (text) => {
        // Falsy values keep the original "treat as empty" behaviour; truthy
        // non-strings are coerced instead of crashing on .substring.
        const input = text ? String(text) : '';
        const output = await pipe(input.substring(0, maxChars), { pooling: 'mean', normalize });
        return Array.from(output.data);
      }),
    );
    embeddings.push(...batchEmbeddings);
    if (onProgress) {
      onProgress({ processed: Math.min(start + step, texts.length), total: texts.length });
    }
  }

  return embeddings;
}
/**
 * Cosine similarity between two numeric vectors of equal length.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector; must have the same length as `a`.
 * @returns {number} dot(a, b) / (|a| * |b|). Returns 0 when either vector has zero
 *   magnitude (including empty vectors) instead of the NaN produced by 0/0.
 * @throws {RangeError} If the vectors differ in length; comparing them would either
 *   yield NaN or silently ignore trailing components.
 */
export function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vector lengths differ (${a.length} vs ${b.length})`,
    );
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // Zero-magnitude vectors have no direction; report "no similarity" rather than NaN.
  if (denominator === 0) return 0;
  return dot / denominator;
}
Estate + 'petr_cermak/zillow-scraper': { + name: 'Zillow Scraper', + category: 'real-estate', + defaultFields: ['address', 'price', 'bedrooms', 'bathrooms', 'sqft', 'propertyType'], + transform: transformZillow + }, + 'epctex/craigslist-scraper': { + name: 'Craigslist Scraper', + category: 'classifieds', + defaultFields: ['title', 'price', 'location', 'description', 'category', 'postedAt'], + transform: transformCraigslist + }, + + // Social Platforms + 'apify/reddit-scraper': { + name: 'Reddit Scraper', + category: 'social-media', + defaultFields: ['title', 'text', 'subreddit', 'score', 'comments', 'author'], + transform: transformReddit + }, + 'apify/facebook-posts-scraper': { + name: 'Facebook Posts Scraper', + category: 'social-media', + defaultFields: ['text', 'likes', 'comments', 'shares', 'author', 'timestamp'], + transform: transformFacebook + }, + + // Places & Maps + 'compass/google-places-api': { + name: 'Google Places API', + category: 'local-business', + defaultFields: ['name', 'rating', 'address', 'phone', 'website', 'types', 'priceLevel'], + transform: transformGooglePlaces } }; @@ -217,6 +273,128 @@ export const USE_CASE_TEMPLATES = { recommendedAlternatives: 'array', embedding: 'array' } + }, + + 'review-aggregator': { + name: 'Review Aggregator', + description: 'Aggregate and analyze reviews from multiple platforms', + targetUsers: ['Product Managers', 'Brand Managers', 'Customer Experience'], + suggestedActors: ['trudax/tripadvisor-scraper', 'maxcopell/yelp-scraper', 'apify/google-maps-scraper'], + memorizeFields: ['name', 'rating', 'reviewCount', 'text', 'sentiment', 'categories'], + enrichWith: ['sentiment_score', 'common_themes', 'rating_trend'], + outputFormat: { + entityId: 'string', + name: 'string', + averageRating: 'number', + totalReviews: 'number', + platforms: 'array', + sentimentAnalysis: 'object', + topPraises: 'array', + topComplaints: 'array', + embedding: 'array' + } + }, + + 'price-tracker': { + name: 'Price Tracker', + 
description: 'Monitor prices across platforms for competitive intelligence', + targetUsers: ['Pricing Teams', 'Buyers', 'Resellers'], + suggestedActors: ['apify/amazon-scraper', 'petr_cermak/zillow-scraper', 'trudax/booking-scraper'], + memorizeFields: ['title', 'price', 'currency', 'availability', 'seller', 'timestamp'], + enrichWith: ['price_history', 'price_alerts', 'competitor_comparison'], + outputFormat: { + productId: 'string', + title: 'string', + currentPrice: 'number', + priceHistory: 'array', + lowestPrice: 'number', + highestPrice: 'number', + priceChange: 'number', + competitors: 'array', + embedding: 'array' + } + }, + + 'social-listening': { + name: 'Social Listening', + description: 'Monitor social conversations about brands, topics, and trends', + targetUsers: ['Social Media Managers', 'PR Teams', 'Brand Managers'], + suggestedActors: ['apify/reddit-scraper', 'apify/twitter-scraper', 'apify/facebook-posts-scraper'], + memorizeFields: ['text', 'author', 'engagement', 'sentiment', 'platform', 'timestamp'], + enrichWith: ['sentiment_analysis', 'influencer_score', 'viral_potential'], + outputFormat: { + postId: 'string', + platform: 'string', + content: 'string', + author: 'object', + engagement: 'object', + sentiment: 'number (-1 to 1)', + mentions: 'array', + hashtags: 'array', + viralScore: 'number (1-100)', + embedding: 'array' + } + }, + + 'talent-sourcing': { + name: 'Talent Sourcing', + description: 'Recruit and source candidates from job platforms', + targetUsers: ['Recruiters', 'HR Teams', 'Talent Acquisition'], + suggestedActors: ['apify/linkedin-scraper', 'epctex/craigslist-scraper'], + memorizeFields: ['title', 'company', 'location', 'skills', 'experience', 'salary'], + enrichWith: ['skill_match', 'culture_fit', 'availability'], + outputFormat: { + candidateId: 'string', + name: 'string', + currentRole: 'string', + company: 'string', + skills: 'array', + experience: 'number', + location: 'string', + matchScore: 'number (1-100)', + 
/**
 * Map a raw TripAdvisor scraper item onto the normalized record shape.
 *
 * Alternate field names are tried in order for each output property. Numeric
 * fields use `??` so a legitimate 0 (e.g. zero reviews) is kept rather than
 * being replaced by the fallback.
 *
 * @param {object} item - Raw dataset item from the TripAdvisor actor.
 * @returns {object} Normalized record with `source: 'tripadvisor'`, at most five
 *   photos, and `scrapedAt` defaulting to the current time in ISO format.
 */
function transformTripAdvisor(item) {
  // Anything other than an array would break .slice().map() below.
  const photos = Array.isArray(item.photos) ? item.photos : [];

  return {
    id: item.id || item.locationId || generateId(),
    source: 'tripadvisor',
    name: item.name || item.title,
    type: item.type || item.category,
    rating: item.rating ?? item.overallRating,
    reviewCount: item.reviewCount ?? item.numberOfReviews,
    priceLevel: item.priceLevel || item.priceRange,
    address: {
      full: item.address || item.addressObj?.street1,
      city: item.city || item.addressObj?.city,
      country: item.country || item.addressObj?.country,
    },
    cuisine: item.cuisine || item.cuisines || [],
    features: item.features || item.amenities || [],
    awards: item.awards || [],
    // Entries may be objects carrying a url or bare values; null entries pass through.
    photos: photos.slice(0, 5).map((photo) => photo?.url || photo),
    url: item.url || item.webUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Zillow scraper item onto the normalized property record shape.
 *
 * @param {object} item - Raw dataset item from the Zillow actor.
 * @returns {object} Normalized record with `source: 'zillow'`, a `specs` object,
 *   at most five photos, and `scrapedAt` defaulting to the current ISO time.
 */
function transformZillow(item) {
  // `hiResImageLink` may be a single URL string rather than a list (TODO confirm
  // against the actor's output). Calling .slice(0, 5) on a string would return its
  // first five characters, so wrap any non-array value before truncating.
  const rawPhotos = item.photos || item.hiResImageLink || [];
  const photoList = Array.isArray(rawPhotos) ? rawPhotos : [rawPhotos];

  return {
    id: item.zpid || item.id || generateId(),
    source: 'zillow',
    address: {
      full: item.address || item.streetAddress,
      street: item.streetAddress,
      city: item.city,
      state: item.state,
      zip: item.zipcode,
    },
    price: item.price || item.zestimate,
    zestimate: item.zestimate,
    rentZestimate: item.rentZestimate,
    propertyType: item.homeType || item.propertyType,
    status: item.homeStatus || item.status,
    specs: {
      // `??` keeps a genuine 0 (e.g. a studio with 0 bedrooms) instead of
      // discarding it in favour of the alternate field.
      bedrooms: item.bedrooms ?? item.beds,
      bathrooms: item.bathrooms ?? item.baths,
      sqft: item.livingArea ?? item.sqft,
      lotSize: item.lotSize ?? item.lotAreaValue,
      yearBuilt: item.yearBuilt,
    },
    features: item.resoFacts?.atAGlanceFacts || [],
    priceHistory: item.priceHistory || [],
    taxHistory: item.taxHistory || [],
    photos: photoList.slice(0, 5),
    url: item.url || item.hdpUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Google Places item onto the normalized place record shape.
 *
 * Both snake_case and camelCase input field names are accepted. Numeric fields
 * (price level, rating count, coordinates) use `??` so that a legitimate 0 is
 * preserved instead of being replaced by the alternate field.
 *
 * @param {object} item - Raw place item.
 * @returns {object} Normalized record with `source: 'google-places'`, at most five
 *   photos and five reviews, and `scrapedAt` defaulting to the current ISO time.
 */
function transformGooglePlaces(item) {
  const photos = Array.isArray(item.photos) ? item.photos : [];
  const reviews = Array.isArray(item.reviews) ? item.reviews : [];

  return {
    id: item.place_id || item.placeId || generateId(),
    source: 'google-places',
    name: item.name,
    rating: item.rating,
    reviewCount: item.user_ratings_total ?? item.reviewCount,
    priceLevel: item.price_level ?? item.priceLevel,
    address: item.formatted_address || item.address,
    phone: item.formatted_phone_number || item.phone,
    website: item.website,
    types: item.types || [],
    location: {
      // 0 is a valid latitude/longitude, so only fall back when the value is absent.
      lat: item.geometry?.location?.lat ?? item.lat,
      lng: item.geometry?.location?.lng ?? item.lng,
    },
    hours: item.opening_hours || item.hours,
    photos: photos.slice(0, 5).map((photo) => photo?.photo_reference || photo),
    reviews: reviews.slice(0, 5).map((review) => ({
      text: review?.text,
      rating: review?.rating,
      author: review?.author_name,
    })),
    url: item.url,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
embeddingModel }); // Initialize SONA if available and enabled if (ruvllm && sonaEnabled) { @@ -274,33 +278,42 @@ try { // EMBEDDING GENERATION (optional) // ============================================ if (generateEmbeddings && generatedData.length > 0) { - log.info(`Generating embeddings with ${embeddingDimensions} dimensions...`); + const modelConfig = EMBEDDING_MODELS[embeddingModel] || EMBEDDING_MODELS['all-MiniLM-L6-v2']; + const effectiveDimensions = useOnnxEmbeddings ? modelConfig.dimensions : embeddingDimensions; - const random = createSeededRandom(seed); - - generatedData = generatedData.map((item, idx) => { - // Generate normalized random embedding - const embedding = []; - let norm = 0; - - for (let j = 0; j < embeddingDimensions; j++) { - const val = random() * 2 - 1; - embedding.push(val); - norm += val * val; - } - - norm = Math.sqrt(norm); - for (let j = 0; j < embeddingDimensions; j++) { - embedding[j] = Math.round((embedding[j] / norm) * 1000000) / 1000000; - } - - return { - ...item, - embedding, - embeddingDimensions - }; + log.info(`Generating embeddings with ${effectiveDimensions} dimensions...`, { + useOnnx: useOnnxEmbeddings, + model: useOnnxEmbeddings ? embeddingModel : 'random' }); + if (useOnnxEmbeddings) { + // Use ONNX-powered semantic embeddings + try { + generatedData = await addEmbeddingsToRecords(generatedData, { modelName: embeddingModel }); + log.info(`Added ONNX embeddings using ${embeddingModel} model`); + await Actor.charge({ eventName: 'onnx-embedding-generation', count: generatedData.length }); + } catch (e) { + log.warning(`ONNX embedding failed: ${e.message}. 
Falling back to random embeddings.`); + // Fall back to random embeddings + const random = createSeededRandom(seed); + generatedData = generatedData.map((item) => ({ + ...item, + embedding: generateRandomEmbedding(effectiveDimensions, random), + embeddingModel: 'random', + embeddingDimensions: effectiveDimensions + })); + } + } else { + // Use random embeddings (faster, for testing) + const random = createSeededRandom(seed); + generatedData = generatedData.map((item) => ({ + ...item, + embedding: generateRandomEmbedding(effectiveDimensions, random), + embeddingModel: 'random', + embeddingDimensions: effectiveDimensions + })); + } + // Charge for embedding generation await Actor.charge({ eventName: 'embedding-generation', count: generatedData.length }); log.info(`Added embeddings to ${generatedData.length} records`); @@ -476,7 +489,7 @@ try { method: 'POST', headers: { 'Content-Type': 'application/json', - 'User-Agent': 'Apify-AI-Synthetic-Data-Generator/2.2' + 'User-Agent': 'Apify-AI-Synthetic-Data-Generator/2.5' }, body: JSON.stringify(webhookPayload) }); diff --git a/examples/apify/neural-trader-system/apify_storage/key_value_stores/default/INPUT.json b/examples/apify/neural-trader-system/apify_storage/key_value_stores/default/INPUT.json new file mode 100644 index 00000000..771de761 --- /dev/null +++ b/examples/apify/neural-trader-system/apify_storage/key_value_stores/default/INPUT.json @@ -0,0 +1 @@ +{"mode":"backtest","symbols":["AAPL"]} diff --git a/examples/apify/neural-trader-system/src/main.js b/examples/apify/neural-trader-system/src/main.js index 43ec9376..9cff663a 100644 --- a/examples/apify/neural-trader-system/src/main.js +++ b/examples/apify/neural-trader-system/src/main.js @@ -1371,7 +1371,12 @@ await Actor.main(async () => { const signals = []; for (const symbol of symbols) { const marketData = generateMarketData(symbol, lookbackPeriod, { stopLoss, takeProfit, timeframe }); - const technicalData = { rsi: 
TechnicalIndicators.calculateRSI(marketData.prices), macd: TechnicalIndicators.calculateMACD(marketData.prices) }; + const technicalData = { + rsi: TechnicalIndicators.calculateRSI(marketData.prices), + macd: TechnicalIndicators.calculateMACD(marketData.prices), + bollinger: TechnicalIndicators.calculateBollinger(marketData.prices), + atr: TechnicalIndicators.calculateATR(marketData.highs, marketData.lows, marketData.prices) + }; const features = prepareFeatures(marketData, technicalData); const output = neuralEngine.forward(features); const signal = signalGenerator.generateSignal([output[0]], marketData); diff --git a/examples/apify/neural-trader-system/storage/datasets/default/000000001.json b/examples/apify/neural-trader-system/storage/datasets/default/000000001.json new file mode 100644 index 00000000..41555410 --- /dev/null +++ b/examples/apify/neural-trader-system/storage/datasets/default/000000001.json @@ -0,0 +1,32 @@ +{ + "timestamp": "2025-12-13T19:20:02.211Z", + "symbol": "BTC/USD", + "price": 245.881612493563, + "signal": "SELL", + "confidence": 100, + "reasons": [ + "Neural prediction: 1.93%", + "Patterns: double_top" + ], + "target": 233.58753186888484, + "stopLoss": 252.02865280590206, + "patterns": [ + "double_top" + ], + "technical": { + "rsi": null, + "macd": null, + "bollinger": null, + "atr": null + }, + "prediction": 0.01929405266773886, + "swarmPredictions": [ + 0.0732855217097578, + 0, + 0, + 0, + 0.02318474162893649 + ], + "timeframe": "1h", + "strategy": "ensemble" +} \ No newline at end of file diff --git a/examples/apify/neural-trader-system/test-analyze.json b/examples/apify/neural-trader-system/test-analyze.json new file mode 100644 index 00000000..74b8a3b7 --- /dev/null +++ b/examples/apify/neural-trader-system/test-analyze.json @@ -0,0 +1,5 @@ +{ + "mode": "analyze", + "symbols": ["AAPL"], + "dataSource": "yahoo" +} diff --git a/examples/apify/neural-trader-system/test-backtest.json 
b/examples/apify/neural-trader-system/test-backtest.json new file mode 100644 index 00000000..38e05a52 --- /dev/null +++ b/examples/apify/neural-trader-system/test-backtest.json @@ -0,0 +1,7 @@ +{ + "mode": "backtest", + "symbols": ["AAPL"], + "initialCapital": 100000, + "monteCarloRuns": 100, + "strategy": "ensemble" +}