fix(neural-trader): Add missing technical indicators to live mode

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
rUv 2025-12-13 19:27:13 +00:00
parent e7b36d1ac2
commit ee8c199fe5
12 changed files with 705 additions and 60 deletions

View file

@ -1,11 +1,11 @@
{
"actorSpecification": 1,
"name": "ai-synthetic-data-generator",
"title": "AI Synthetic Data Generator - Enterprise Mock Data with MCP & Actor Integration",
"description": "Generate unlimited, high-quality synthetic data with TRM/SONA self-learning. NEW: MCP server for AI agent integration + one-click data from Google Maps, Instagram, TikTok, Amazon scrapers. 6 use-case templates (Lead Intelligence, Competitor Monitor, Support Knowledge). Simulate Bloomberg terminals, medical records, supply chains, e-commerce, and more.",
"seoTitle": "AI Synthetic Data Generator - MCP Server & Actor Integration for RAG",
"seoDescription": "Generate synthetic data with MCP server support. One-click integration with Google Maps, Instagram, TikTok, Amazon scrapers. 6 pre-built templates for Lead Intelligence, Competitor Monitor, Support RAG. TRM/SONA self-learning.",
"version": "2.2",
"title": "AI Synthetic Data Generator - ONNX Embeddings, MCP & 21 Actor Integrations",
"description": "Generate unlimited synthetic data with ONNX-powered semantic embeddings (all-MiniLM-L6-v2, bge-small). 21 actor integrations (Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support.",
"seoTitle": "AI Synthetic Data Generator - ONNX Embeddings & 21 Actor Integrations",
"seoDescription": "Generate synthetic data with ONNX semantic embeddings. 21 actor integrations (Google Maps, Reddit, Yelp, Zillow). 12 templates. TRM/SONA self-learning.",
"version": "2.5",
"buildTag": "latest",
"minMemoryMbytes": 256,
"maxMemoryMbytes": 4096,

View file

@ -1,6 +1,6 @@
{
"title": "AI Synthetic Data Generator",
"description": "Generate unlimited, high-quality synthetic data for testing scrapers, training AI models, and building agentic applications. 18 data types including e-commerce, Bloomberg terminal, medical records, supply chain, and more. Now with one-click Apify actor integration and use case templates. Powered by TRM/SONA self-learning.",
"title": "AI Synthetic Data Generator v2.5",
"description": "Generate unlimited synthetic data with ONNX-powered semantic embeddings (all-MiniLM-L6-v2, bge-small). 21 actor integrations (Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support.",
"type": "object",
"schemaVersion": 1,
"properties": {
@ -58,7 +58,7 @@
"integrateActorId": {
"title": "Apify Actor to Integrate",
"type": "string",
"description": "Actor ID to pull data from (e.g., apify/google-maps-scraper). Used in 'integrate' mode.",
"description": "Actor ID to pull data from. 21 actors supported. Used in 'integrate' mode.",
"editor": "select",
"enum": [
"apify/google-maps-scraper",
@ -73,7 +73,15 @@
"apify/website-content-crawler",
"apify/cheerio-scraper",
"apify/news-scraper",
"apify/linkedin-scraper"
"apify/linkedin-scraper",
"trudax/tripadvisor-scraper",
"maxcopell/yelp-scraper",
"trudax/booking-scraper",
"petr_cermak/zillow-scraper",
"epctex/craigslist-scraper",
"apify/reddit-scraper",
"apify/facebook-posts-scraper",
"compass/google-places-api"
],
"enumTitles": [
"Google Maps Scraper - Local business data",
@ -88,10 +96,18 @@
"Website Content Crawler - Full site content",
"Cheerio Scraper - Structured extraction",
"News Scraper - News articles",
"LinkedIn Scraper - Jobs, profiles"
"LinkedIn Scraper - Jobs, profiles",
"TripAdvisor Scraper - Reviews, restaurants, hotels",
"Yelp Scraper - Business reviews, ratings",
"Booking.com Scraper - Hotels, accommodations",
"Zillow Scraper - Real estate listings",
"Craigslist Scraper - Classifieds, listings",
"Reddit Scraper - Posts, comments, subreddits",
"Facebook Posts Scraper - Posts, engagement",
"Google Places API - Places, reviews"
],
"sectionCaption": "Actor Integration",
"sectionDescription": "One-click integration with top Apify actors for RAG/memory use cases"
"sectionCaption": "Actor Integration (21 Actors)",
"sectionDescription": "One-click integration with 21 top Apify actors for RAG/memory use cases"
},
"integrateRunId": {
"title": "Run ID",
@ -116,18 +132,24 @@
"useTemplate": {
"title": "Use Case Template",
"type": "string",
"description": "Pre-built template for common use cases. Used in 'template' mode.",
"description": "Pre-built template for common use cases. 12 templates available. Used in 'template' mode.",
"editor": "select",
"enum": ["lead-intelligence", "competitor-monitor", "support-knowledge", "research-assistant", "content-library", "product-catalog"],
"enum": ["lead-intelligence", "competitor-monitor", "support-knowledge", "research-assistant", "content-library", "product-catalog", "review-aggregator", "price-tracker", "social-listening", "talent-sourcing", "real-estate-intel", "travel-planner"],
"enumTitles": [
"Lead Intelligence - Sales teams memorizing prospect data",
"Competitor Monitor - Track competitor mentions/changes",
"Support Knowledge - Customer support RAG system",
"Research Assistant - Academic/market research",
"Content Library - Content creators' reference",
"Product Catalog - E-commerce product memory"
"Product Catalog - E-commerce product memory",
"Review Aggregator - Aggregate reviews from multiple platforms",
"Price Tracker - Monitor prices for competitive intelligence",
"Social Listening - Monitor social conversations and trends",
"Talent Sourcing - Recruit candidates from job platforms",
"Real Estate Intel - Property market analysis",
"Travel Planner - Hotels, restaurants, activities data"
],
"sectionCaption": "Use Case Templates",
"sectionCaption": "Use Case Templates (12 Templates)",
"sectionDescription": "Pre-configured templates for common data workflows"
},
"schema": {
@ -183,7 +205,30 @@
"title": "Generate Embeddings",
"type": "boolean",
"description": "Generate vector embeddings for all output records (useful for RAG systems)",
"default": false
"default": false,
"sectionCaption": "ONNX Embeddings",
"sectionDescription": "Semantic embeddings powered by ONNX Runtime (Hugging Face Transformers.js)"
},
"useOnnxEmbeddings": {
"title": "Use ONNX Semantic Embeddings",
"type": "boolean",
"description": "Use real semantic embeddings via ONNX (slower but more accurate) vs random vectors (fast for testing)",
"default": true
},
"embeddingModel": {
"title": "ONNX Embedding Model",
"type": "string",
"description": "Choose embedding model. Smaller models are faster, larger models are more accurate.",
"editor": "select",
"default": "all-MiniLM-L6-v2",
"enum": ["all-MiniLM-L6-v2", "bge-small-en-v1.5", "all-mpnet-base-v2", "e5-small-v2", "gte-small"],
"enumTitles": [
"all-MiniLM-L6-v2 (384d, Fast, Good) - Recommended",
"bge-small-en-v1.5 (384d, Fast, Excellent)",
"all-mpnet-base-v2 (768d, Medium, Excellent)",
"e5-small-v2 (384d, Fast, Very Good)",
"gte-small (384d, Fast, Very Good)"
]
},
"simulationMode": {
"title": "Long-Running Simulation",

View file

@ -1,24 +1,24 @@
# AI Synthetic Data Generator - MCP Server & Actor Integration
# AI Synthetic Data Generator - ONNX Embeddings, MCP & 21 Actor Integrations
**Generate unlimited synthetic data** grounded in real-world patterns. **One-click integration** with 13 popular Apify web scrapers (Google Maps, Instagram, TikTok, Amazon, LinkedIn) lets you transform real scraped data into AI-ready formats for RAG systems, agent memory, and model training.
**Generate unlimited synthetic data** with **ONNX-powered semantic embeddings** (all-MiniLM-L6-v2, bge-small). **One-click integration** with 21 popular Apify web scrapers (Google Maps, Instagram, TikTok, Amazon, LinkedIn, Reddit, Yelp, TripAdvisor, Zillow, Booking.com + more). 12 use-case templates. TRM/SONA self-learning. MCP server support.
**Why grounding matters:** Pure synthetic data can drift from reality. By integrating with live Apify scrapers, your synthetic data inherits real naming conventions, price distributions, engagement patterns, and business characteristics - making your AI models and tests far more realistic.
[![Apify Actor](https://img.shields.io/badge/Apify-Actor-blue)](https://apify.com/ruv/ai-synthetic-data-generator)
[![MCP Server](https://img.shields.io/badge/MCP-Server-purple)](https://modelcontextprotocol.io)
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
[![Version](https://img.shields.io/badge/version-2.2-green)](https://github.com/ruvnet/ruvector)
[![Version](https://img.shields.io/badge/version-2.5-green)](https://github.com/ruvnet/ruvector)
## What's New in v2.2
## What's New in v2.5
| Feature | Description |
|---------|-------------|
| **One-Click Scraper Integration** | Ground synthetic data with real patterns from Google Maps, Instagram, TikTok, YouTube, Amazon, LinkedIn, and 7 more scrapers |
| **ONNX Semantic Embeddings** | Real semantic embeddings via Hugging Face Transformers.js (all-MiniLM-L6-v2, bge-small, all-mpnet-base-v2, e5-small, gte-small) |
| **21 Actor Integrations** | Added Reddit, Yelp, TripAdvisor, Zillow, Booking.com, Craigslist, Facebook, Google Places |
| **12 Use Case Templates** | Added Review Aggregator, Price Tracker, Social Listening, Talent Sourcing, Real Estate Intel, Travel Planner |
| **MCP Server** | Use as AI agent tool (Claude, GPT) via Model Context Protocol |
| **6 Use Case Templates** | Lead Intelligence, Competitor Monitor, Support RAG, and more |
| **Enhanced Grounding** | Transform scraped data into AI-ready formats that preserve real-world distributions |
| **Webhook Support** | POST results to your endpoint for async workflows |
| **Output Formats** | JSON, JSONL, CSV export options |
| **TRM/SONA Self-Learning** | 7M parameter recursive reasoning with 3-tier learning |
| **Output Formats** | JSON, JSONL, CSV export with webhook support |
---
@ -26,9 +26,10 @@
| Feature | Description |
|---------|-------------|
| **Scraper Integration** | One-click grounding with 13 popular Apify scrapers for realistic data |
| **ONNX Embeddings** | 5 semantic embedding models via Transformers.js for RAG-ready vectors |
| **21 Actor Integrations** | One-click grounding with 21 popular Apify scrapers |
| **12 Templates** | Pre-built workflows for common use cases |
| **MCP Server** | Integrate with Claude Code, GPT, and AI agents |
| **6 Templates** | Pre-built workflows for common use cases |
| **TRM** | 7M parameter recursive reasoning (83% on GSM8K) |
| **SONA** | 3-tier self-learning (Instant/Background/Deep) |
| **EWC++** | Pattern preservation across generations (lambda=2000) |
@ -92,7 +93,35 @@ Once connected, you get these tools in your AI agent:
---
## One-Click Actor Integration
## ONNX Semantic Embeddings
Generate real semantic embeddings for RAG systems using ONNX Runtime and Hugging Face Transformers.js.
### Available Models
| Model | Dimensions | Speed | Quality | Use Case |
|-------|------------|-------|---------|----------|
| **all-MiniLM-L6-v2** | 384 | Fast | Good | General purpose (recommended) |
| **bge-small-en-v1.5** | 384 | Fast | Excellent | High-quality retrieval |
| **all-mpnet-base-v2** | 768 | Medium | Excellent | Maximum accuracy |
| **e5-small-v2** | 384 | Fast | Very Good | Balanced performance |
| **gte-small** | 384 | Fast | Very Good | Efficient retrieval |
### Example: Generate with ONNX Embeddings
```json
{
"dataType": "ecommerce",
"count": 100,
"generateEmbeddings": true,
"useOnnxEmbeddings": true,
"embeddingModel": "all-MiniLM-L6-v2"
}
```
---
## One-Click Actor Integration (21 Actors)
Transform data from popular Apify scrapers into AI-ready format.
@ -113,6 +142,14 @@ Transform data from popular Apify scrapers into AI-ready format.
| **apify/web-scraper** | General | Any web page data |
| **apify/cheerio-scraper** | General | Structured extraction |
| **apify/news-scraper** | News | Articles, authors, sources |
| **trudax/tripadvisor-scraper** | Reviews | Hotels, restaurants, reviews |
| **maxcopell/yelp-scraper** | Reviews | Business reviews, ratings |
| **trudax/booking-scraper** | Travel | Hotels, accommodations, prices |
| **petr_cermak/zillow-scraper** | Real Estate | Property listings, prices |
| **epctex/craigslist-scraper** | Classifieds | Listings, local posts |
| **apify/reddit-scraper** | Social Media | Posts, comments, subreddits |
| **apify/facebook-posts-scraper** | Social Media | Posts, engagement |
| **compass/google-places-api** | Local Business | Places, reviews, details |
### Example: Integrate Google Maps Data
@ -128,7 +165,7 @@ Transform data from popular Apify scrapers into AI-ready format.
---
## Use Case Templates
## Use Case Templates (12 Templates)
One-click deployment for common AI/RAG scenarios.
@ -140,6 +177,12 @@ One-click deployment for common AI/RAG scenarios.
| **research-assistant** | Academic/market research | Researchers | Google Search, News, Content |
| **content-library** | Content creators' reference | Creators | Instagram, TikTok, YouTube |
| **product-catalog** | E-commerce product memory | E-commerce | Amazon, Shopify, Google Maps |
| **review-aggregator** | Aggregate reviews from platforms | Product Managers | TripAdvisor, Yelp, Google Maps |
| **price-tracker** | Monitor prices across sites | Pricing Teams | Amazon, Zillow, Booking.com |
| **social-listening** | Monitor social conversations | Social Media, PR | Reddit, Twitter, Facebook |
| **talent-sourcing** | Recruit from job platforms | Recruiters, HR | LinkedIn, Craigslist |
| **real-estate-intel** | Property market analysis | Real Estate, Investors | Zillow, Google Maps, Craigslist |
| **travel-planner** | Hotels, restaurants, activities | Travel Agents | TripAdvisor, Booking.com, Google Maps |
### Example: Lead Intelligence Template

View file

@ -1,7 +1,7 @@
{
"name": "agentic-synth-apify-actor",
"version": "2.2.0",
"description": "AI Synthetic Data Generator with TRM/SONA self-learning, MCP server, and Apify actor integrations",
"version": "2.5.0",
"description": "AI Synthetic Data Generator with ONNX embeddings, TRM/SONA self-learning, MCP server, and 21 Apify actor integrations",
"main": "src/main.js",
"type": "module",
"bin": {
@ -19,6 +19,7 @@
"@google/generative-ai": "^0.24.1",
"@modelcontextprotocol/sdk": "^1.0.0",
"@ruvector/ruvllm": "^0.2.3",
"@xenova/transformers": "^2.17.2",
"apify": "^3.5.2"
},
"optionalDependencies": {

View file

@ -0,0 +1,96 @@
/**
* ONNX-based Embeddings Module
* Uses @xenova/transformers (Hugging Face Transformers.js) for ONNX Runtime embeddings
*/
import { log } from 'apify';
// Cached feature-extraction pipeline; stays null until initEmbeddingPipeline() has loaded a model.
let embeddingPipeline = null;
// Key (in EMBEDDING_MODELS) of the model the cached pipeline was created for.
let currentModel = null;
/**
 * Catalog of the embedding models this module can load, keyed by short name.
 *
 * - `id`: model identifier handed to the Transformers.js pipeline loader.
 * - `dimensions`: vector length reported alongside generated embeddings.
 * - `speed` / `quality`: descriptive labels only.
 */
export const EMBEDDING_MODELS = {
  'all-MiniLM-L6-v2': {
    id: 'Xenova/all-MiniLM-L6-v2',
    dimensions: 384,
    speed: 'fast',
    quality: 'good',
  },
  'bge-small-en-v1.5': {
    id: 'Xenova/bge-small-en-v1.5',
    dimensions: 384,
    speed: 'fast',
    quality: 'excellent',
  },
  'all-mpnet-base-v2': {
    id: 'Xenova/all-mpnet-base-v2',
    dimensions: 768,
    speed: 'medium',
    quality: 'excellent',
  },
  'e5-small-v2': {
    id: 'Xenova/e5-small-v2',
    dimensions: 384,
    speed: 'fast',
    quality: 'very-good',
  },
  'gte-small': {
    id: 'Xenova/gte-small',
    dimensions: 384,
    speed: 'fast',
    quality: 'very-good',
  },
};
/**
 * Load the feature-extraction pipeline for a model, reusing the cached one
 * when the same model was loaded by a previous call.
 *
 * @param {string} [modelName='all-MiniLM-L6-v2'] - Key in EMBEDDING_MODELS.
 * @returns {Promise<Function>} The (possibly cached) pipeline.
 * @throws {Error} If `modelName` is not a key of EMBEDDING_MODELS.
 */
export async function initEmbeddingPipeline(modelName = 'all-MiniLM-L6-v2') {
  const config = EMBEDDING_MODELS[modelName];
  if (!config) {
    throw new Error(`Unknown model: ${modelName}`);
  }

  const alreadyLoaded = Boolean(embeddingPipeline) && currentModel === modelName;
  if (alreadyLoaded) {
    return embeddingPipeline;
  }

  // Imported lazily so the library is only loaded when embeddings are actually requested.
  const { pipeline: createPipeline } = await import('@xenova/transformers');
  log.info(`Loading ONNX embedding model: ${config.id}...`);

  embeddingPipeline = await createPipeline('feature-extraction', config.id, { quantized: true });
  currentModel = modelName;
  return embeddingPipeline;
}
/**
 * Generate one embedding vector for a piece of text.
 *
 * The input is coerced to a string (null/undefined become '') so a missing
 * value does not throw, matching the tolerance of generateEmbeddingsBatch,
 * and is truncated to its first 8000 characters before being embedded.
 *
 * @param {*} text - Text to embed; non-string values are stringified.
 * @param {object} [options]
 * @param {string} [options.modelName='all-MiniLM-L6-v2'] - Key in EMBEDDING_MODELS.
 * @param {boolean} [options.normalize=true] - Forwarded to the pipeline.
 * @returns {Promise<number[]>} The pipeline output data as a plain array.
 */
export async function generateEmbedding(text, options = {}) {
  const { modelName = 'all-MiniLM-L6-v2', normalize = true } = options;
  const pipe = await initEmbeddingPipeline(modelName);
  const input = String(text ?? '').substring(0, 8000);
  const output = await pipe(input, { pooling: 'mean', normalize });
  return Array.from(output.data);
}
/**
 * Generate embeddings for many texts, processing them in batches.
 *
 * Texts inside a batch are embedded concurrently; batches run one after
 * another. Each text is stringified (null/undefined become '') and truncated
 * to its first 8000 characters.
 *
 * @param {Array<*>} texts - Texts to embed.
 * @param {object} [options]
 * @param {string} [options.modelName='all-MiniLM-L6-v2'] - Key in EMBEDDING_MODELS.
 * @param {number} [options.batchSize=32] - Texts per batch. Values below 1 or
 *   non-numeric values fall back to 32, since a zero step would never advance the loop.
 * @param {?function({processed: number, total: number}): void} [options.onProgress]
 *   Called after each batch.
 * @param {boolean} [options.normalize=true] - Forwarded to the pipeline.
 * @returns {Promise<number[][]>} One embedding per input text, in input order.
 */
export async function generateEmbeddingsBatch(texts, options = {}) {
  const {
    modelName = 'all-MiniLM-L6-v2',
    batchSize = 32,
    onProgress = null,
    normalize = true,
  } = options;

  // Guard against 0 / negative / NaN sizes, which would otherwise loop forever.
  const requestedSize = Number(batchSize);
  const step = requestedSize >= 1 ? requestedSize : 32;

  const pipe = await initEmbeddingPipeline(modelName);

  const embedOne = async (text) => {
    const input = String(text ?? '').substring(0, 8000);
    const output = await pipe(input, { pooling: 'mean', normalize });
    return Array.from(output.data);
  };

  const embeddings = [];
  for (let i = 0; i < texts.length; i += step) {
    const batch = texts.slice(i, i + step);
    const batchEmbeddings = await Promise.all(batch.map((text) => embedOne(text)));
    embeddings.push(...batchEmbeddings);
    if (onProgress) {
      onProgress({ processed: Math.min(i + step, texts.length), total: texts.length });
    }
  }
  return embeddings;
}
/**
 * Attach an embedding to every record.
 *
 * For each record, the values of `textFields` (looked up on the record
 * itself, then on `record.data`) are collected; strings are used as-is,
 * arrays are space-joined, anything else is ignored. The fragments are
 * joined with spaces, and the literal 'empty' is embedded when nothing
 * usable was found.
 *
 * @param {Array<object>} records - Records to enrich; returned untouched when empty or nullish.
 * @param {object} [options]
 * @param {string} [options.modelName='all-MiniLM-L6-v2'] - Key in EMBEDDING_MODELS.
 * @param {string[]} [options.textFields] - Fields to build the text from.
 * @returns {Promise<Array<object>>} Copies of the records with `embedding`,
 *   `embeddingModel` and `embeddingDimensions` added.
 */
export async function addEmbeddingsToRecords(records, options = {}) {
  const {
    modelName = 'all-MiniLM-L6-v2',
    textFields = ['title', 'description', 'text', 'content', 'caption', 'body', 'name'],
  } = options;

  if (!records?.length) {
    return records;
  }

  const modelConfig = EMBEDDING_MODELS[modelName];
  log.info(`Generating ONNX embeddings for ${records.length} records with ${modelName}`);

  // Turn one field value into a text fragment ('' means "skip").
  const toFragment = (value) => {
    if (typeof value === 'string') {
      return value;
    }
    return Array.isArray(value) ? value.join(' ') : '';
  };

  const texts = records.map((record) => {
    const fragments = [];
    for (const field of textFields) {
      const fragment = toFragment(record[field] || record.data?.[field]);
      if (fragment) {
        fragments.push(fragment);
      }
    }
    return fragments.join(' ') || 'empty';
  });

  const embeddings = await generateEmbeddingsBatch(texts, { modelName });

  return records.map((record, index) => ({
    ...record,
    embedding: embeddings[index],
    embeddingModel: modelName,
    embeddingDimensions: modelConfig.dimensions,
  }));
}
/**
 * Build a random unit-length vector, rounded to 6 decimal places.
 *
 * Components are drawn as `random() * 2 - 1` and then divided by the
 * vector's Euclidean norm.
 *
 * @param {number} dimensions - Number of components to generate.
 * @param {function(): number} [random=Math.random] - Source of values in [0, 1);
 *   pass a seeded generator for reproducible output.
 * @returns {number[]} The normalized vector. If every drawn component is 0
 *   (norm 0), an all-zero vector is returned instead of NaN values.
 */
export function generateRandomEmbedding(dimensions, random = Math.random) {
  const embedding = [];
  let sumOfSquares = 0;
  for (let i = 0; i < dimensions; i++) {
    const val = random() * 2 - 1;
    embedding.push(val);
    sumOfSquares += val * val;
  }

  const norm = Math.sqrt(sumOfSquares);
  if (norm === 0) {
    // Dividing by a zero norm would fill the vector with NaN.
    return embedding.map(() => 0);
  }
  return embedding.map((v) => Math.round((v / norm) * 1000000) / 1000000);
}
/**
 * Cosine similarity between two numeric vectors.
 *
 * Iterates over the length of `a`; both vectors are expected to have the
 * same length.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has
 *   zero magnitude (the ratio would otherwise be NaN).
 */
export function cosineSimilarity(a, b) {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  if (denominator === 0) {
    return 0;
  }
  return dot / denominator;
}

View file

@ -98,6 +98,62 @@ export const SUPPORTED_ACTORS = {
category: 'professional',
defaultFields: ['title', 'company', 'location', 'description', 'salary'],
transform: transformLinkedIn
},
// Reviews & Local
'trudax/tripadvisor-scraper': {
name: 'TripAdvisor Scraper',
category: 'reviews',
defaultFields: ['name', 'rating', 'reviewCount', 'address', 'priceLevel', 'cuisine'],
transform: transformTripAdvisor
},
'maxcopell/yelp-scraper': {
name: 'Yelp Scraper',
category: 'reviews',
defaultFields: ['name', 'rating', 'reviewCount', 'address', 'categories', 'phone'],
transform: transformYelp
},
'trudax/booking-scraper': {
name: 'Booking.com Scraper',
category: 'travel',
defaultFields: ['name', 'rating', 'price', 'location', 'amenities', 'reviewScore'],
transform: transformBooking
},
// Real Estate
'petr_cermak/zillow-scraper': {
name: 'Zillow Scraper',
category: 'real-estate',
defaultFields: ['address', 'price', 'bedrooms', 'bathrooms', 'sqft', 'propertyType'],
transform: transformZillow
},
'epctex/craigslist-scraper': {
name: 'Craigslist Scraper',
category: 'classifieds',
defaultFields: ['title', 'price', 'location', 'description', 'category', 'postedAt'],
transform: transformCraigslist
},
// Social Platforms
'apify/reddit-scraper': {
name: 'Reddit Scraper',
category: 'social-media',
defaultFields: ['title', 'text', 'subreddit', 'score', 'comments', 'author'],
transform: transformReddit
},
'apify/facebook-posts-scraper': {
name: 'Facebook Posts Scraper',
category: 'social-media',
defaultFields: ['text', 'likes', 'comments', 'shares', 'author', 'timestamp'],
transform: transformFacebook
},
// Places & Maps
'compass/google-places-api': {
name: 'Google Places API',
category: 'local-business',
defaultFields: ['name', 'rating', 'address', 'phone', 'website', 'types', 'priceLevel'],
transform: transformGooglePlaces
}
};
@ -217,6 +273,128 @@ export const USE_CASE_TEMPLATES = {
recommendedAlternatives: 'array<string>',
embedding: 'array<number>'
}
},
'review-aggregator': {
name: 'Review Aggregator',
description: 'Aggregate and analyze reviews from multiple platforms',
targetUsers: ['Product Managers', 'Brand Managers', 'Customer Experience'],
suggestedActors: ['trudax/tripadvisor-scraper', 'maxcopell/yelp-scraper', 'apify/google-maps-scraper'],
memorizeFields: ['name', 'rating', 'reviewCount', 'text', 'sentiment', 'categories'],
enrichWith: ['sentiment_score', 'common_themes', 'rating_trend'],
outputFormat: {
entityId: 'string',
name: 'string',
averageRating: 'number',
totalReviews: 'number',
platforms: 'array<object>',
sentimentAnalysis: 'object',
topPraises: 'array<string>',
topComplaints: 'array<string>',
embedding: 'array<number>'
}
},
'price-tracker': {
name: 'Price Tracker',
description: 'Monitor prices across platforms for competitive intelligence',
targetUsers: ['Pricing Teams', 'Buyers', 'Resellers'],
suggestedActors: ['apify/amazon-scraper', 'petr_cermak/zillow-scraper', 'trudax/booking-scraper'],
memorizeFields: ['title', 'price', 'currency', 'availability', 'seller', 'timestamp'],
enrichWith: ['price_history', 'price_alerts', 'competitor_comparison'],
outputFormat: {
productId: 'string',
title: 'string',
currentPrice: 'number',
priceHistory: 'array<object>',
lowestPrice: 'number',
highestPrice: 'number',
priceChange: 'number',
competitors: 'array<object>',
embedding: 'array<number>'
}
},
'social-listening': {
name: 'Social Listening',
description: 'Monitor social conversations about brands, topics, and trends',
targetUsers: ['Social Media Managers', 'PR Teams', 'Brand Managers'],
suggestedActors: ['apify/reddit-scraper', 'apify/twitter-scraper', 'apify/facebook-posts-scraper'],
memorizeFields: ['text', 'author', 'engagement', 'sentiment', 'platform', 'timestamp'],
enrichWith: ['sentiment_analysis', 'influencer_score', 'viral_potential'],
outputFormat: {
postId: 'string',
platform: 'string',
content: 'string',
author: 'object',
engagement: 'object',
sentiment: 'number (-1 to 1)',
mentions: 'array<string>',
hashtags: 'array<string>',
viralScore: 'number (1-100)',
embedding: 'array<number>'
}
},
'talent-sourcing': {
name: 'Talent Sourcing',
description: 'Recruit and source candidates from job platforms',
targetUsers: ['Recruiters', 'HR Teams', 'Talent Acquisition'],
suggestedActors: ['apify/linkedin-scraper', 'epctex/craigslist-scraper'],
memorizeFields: ['title', 'company', 'location', 'skills', 'experience', 'salary'],
enrichWith: ['skill_match', 'culture_fit', 'availability'],
outputFormat: {
candidateId: 'string',
name: 'string',
currentRole: 'string',
company: 'string',
skills: 'array<string>',
experience: 'number',
location: 'string',
matchScore: 'number (1-100)',
embedding: 'array<number>'
}
},
'real-estate-intel': {
name: 'Real Estate Intelligence',
description: 'Market analysis and property intelligence for real estate',
targetUsers: ['Real Estate Agents', 'Investors', 'Property Managers'],
suggestedActors: ['petr_cermak/zillow-scraper', 'apify/google-maps-scraper', 'epctex/craigslist-scraper'],
memorizeFields: ['address', 'price', 'sqft', 'bedrooms', 'bathrooms', 'propertyType'],
enrichWith: ['market_trends', 'comparable_sales', 'neighborhood_score'],
outputFormat: {
propertyId: 'string',
address: 'string',
price: 'number',
pricePerSqft: 'number',
propertyType: 'string',
specs: 'object',
marketAnalysis: 'object',
comparables: 'array<object>',
investmentScore: 'number (1-100)',
embedding: 'array<number>'
}
},
'travel-planner': {
name: 'Travel Planner',
description: 'Plan trips with aggregated hotel, restaurant, and activity data',
targetUsers: ['Travel Agents', 'Travelers', 'Tourism Boards'],
suggestedActors: ['trudax/tripadvisor-scraper', 'trudax/booking-scraper', 'apify/google-maps-scraper'],
memorizeFields: ['name', 'rating', 'price', 'location', 'amenities', 'reviews'],
enrichWith: ['booking_availability', 'best_time_to_visit', 'local_tips'],
outputFormat: {
placeId: 'string',
name: 'string',
type: 'string (hotel, restaurant, attraction)',
rating: 'number',
priceRange: 'string',
location: 'object',
highlights: 'array<string>',
reviews: 'array<object>',
embedding: 'array<number>'
}
}
};
@ -515,6 +693,225 @@ function transformLinkedIn(item) {
};
}
/**
 * Map a raw TripAdvisor scraper item onto a normalized record tagged
 * `source: 'tripadvisor'`.
 *
 * Each output field reads a primary key and falls back to an alternative
 * key. At most the first 5 photos are kept, reduced to their `url` when
 * the entry is an object carrying one.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformTripAdvisor(item) {
  return {
    id: item.id || item.locationId || generateId(),
    source: 'tripadvisor',
    name: item.name || item.title,
    type: item.type || item.category,
    // `??` instead of `||`: a rating or review count of 0 is real data and must be kept.
    rating: item.rating ?? item.overallRating,
    reviewCount: item.reviewCount ?? item.numberOfReviews,
    priceLevel: item.priceLevel || item.priceRange,
    address: {
      full: item.address || item.addressObj?.street1,
      city: item.city || item.addressObj?.city,
      country: item.country || item.addressObj?.country,
    },
    cuisine: item.cuisine || item.cuisines || [],
    features: item.features || item.amenities || [],
    awards: item.awards || [],
    photos: (item.photos || []).slice(0, 5).map((p) => p.url || p),
    url: item.url || item.webUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Yelp scraper item onto a normalized record tagged `source: 'yelp'`.
 *
 * Category entries are reduced to their `title` when they are objects
 * carrying one; at most the first 5 photos are kept.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformYelp(item) {
  const place = item.location;
  return {
    id: item.id || item.businessId || generateId(),
    source: 'yelp',
    name: item.name || item.businessName,
    rating: item.rating,
    // `??` instead of `||`: a business with 0 reviews must report 0, not undefined.
    reviewCount: item.reviewCount ?? item.review_count,
    priceLevel: item.price || item.priceRange,
    address: {
      full: item.address || place?.display_address?.join(', '),
      street: place?.address1,
      city: place?.city,
      state: place?.state,
      zip: place?.zip_code,
    },
    phone: item.phone || item.display_phone,
    categories: (item.categories || []).map((c) => c.title || c),
    hours: item.hours || item.businessHours,
    photos: (item.photos || []).slice(0, 5),
    isClaimed: item.is_claimed,
    url: item.url,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Booking.com scraper item onto a normalized record tagged
 * `source: 'booking'`.
 *
 * `type` defaults to 'hotel' and `price.currency` to 'USD' when the item
 * does not provide them. At most the first 5 photos are kept, reduced to
 * their `url` when the entry is an object carrying one.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformBooking(item) {
  return {
    id: item.id || item.hotelId || generateId(),
    source: 'booking',
    name: item.name || item.hotelName,
    type: item.type || item.accommodationType || 'hotel',
    // Numeric fields use `??` so that a legitimate 0 (score, count, price,
    // or a coordinate on the equator / prime meridian) is not replaced.
    rating: item.rating ?? item.reviewScore,
    reviewScore: item.reviewScore ?? item.score,
    reviewCount: item.reviewCount ?? item.numberOfReviews,
    stars: item.stars ?? item.starRating,
    price: {
      amount: item.price ?? item.priceAmount,
      currency: item.currency || 'USD',
      perNight: item.pricePerNight ?? item.price,
    },
    location: {
      address: item.address,
      city: item.city,
      country: item.country,
      lat: item.latitude ?? item.location?.lat,
      lng: item.longitude ?? item.location?.lng,
    },
    amenities: item.amenities || item.facilities || [],
    photos: (item.photos || []).slice(0, 5).map((p) => p.url || p),
    url: item.url,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Zillow scraper item onto a normalized record tagged
 * `source: 'zillow'`.
 *
 * `price` falls back to `zestimate` when the item has no (truthy) price.
 * At most 5 photos are kept.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformZillow(item) {
  // The fallback `hiResImageLink` may be a single URL string; slicing a string
  // would keep only its first 5 characters, so wrap non-arrays in an array.
  const rawPhotos = item.photos || item.hiResImageLink || [];
  const photos = (Array.isArray(rawPhotos) ? rawPhotos : [rawPhotos]).slice(0, 5);

  return {
    id: item.zpid || item.id || generateId(),
    source: 'zillow',
    address: {
      full: item.address || item.streetAddress,
      street: item.streetAddress,
      city: item.city,
      state: item.state,
      zip: item.zipcode,
    },
    price: item.price || item.zestimate,
    zestimate: item.zestimate,
    rentZestimate: item.rentZestimate,
    propertyType: item.homeType || item.propertyType,
    status: item.homeStatus || item.status,
    specs: {
      // `??` keeps a legitimate 0 (e.g. a studio with 0 bedrooms).
      bedrooms: item.bedrooms ?? item.beds,
      bathrooms: item.bathrooms ?? item.baths,
      sqft: item.livingArea || item.sqft,
      lotSize: item.lotSize || item.lotAreaValue,
      yearBuilt: item.yearBuilt,
    },
    features: item.resoFacts?.atAGlanceFacts || [],
    priceHistory: item.priceHistory || [],
    taxHistory: item.taxHistory || [],
    photos,
    url: item.url || item.hdpUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Craigslist scraper item onto a normalized record tagged
 * `source: 'craigslist'`.
 *
 * At most the first 5 images are kept (`images`, falling back to `pics`).
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformCraigslist(item) {
  const identifier = item.id || item.postId || generateId();
  const pictures = item.images || item.pics || [];
  const place = {
    area: item.location || item.hood,
    city: item.city,
    region: item.region,
  };

  return {
    id: identifier,
    source: 'craigslist',
    title: item.title || item.postTitle,
    price: item.price,
    category: item.category || item.section,
    subcategory: item.subcategory,
    location: place,
    description: item.description || item.body,
    attributes: item.attributes || {},
    images: pictures.slice(0, 5),
    postedAt: item.datetime || item.postedAt,
    updatedAt: item.updated,
    url: item.url || item.postUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Reddit scraper item onto a normalized record tagged
 * `source: 'reddit'`.
 *
 * - `type` defaults to 'text' for self posts (`isSelf`) and 'link' otherwise.
 * - `engagement.score` uses `score` when present (including 0); otherwise it
 *   is derived as `ups - downs` when `ups` is available, else left undefined.
 * - `url` falls back to a reddit.com link built from `permalink`, and is
 *   undefined when neither is available.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformReddit(item) {
  // Only derive a score when there is an upvote count; `undefined - 0` would be NaN.
  const derivedScore = item.ups != null ? item.ups - (item.downs ?? 0) : undefined;
  // Avoid emitting 'https://reddit.comundefined' when there is no permalink.
  const permalinkUrl = item.permalink ? `https://reddit.com${item.permalink}` : undefined;

  return {
    id: item.id || item.postId || generateId(),
    source: 'reddit',
    type: item.type || (item.isSelf ? 'text' : 'link'),
    title: item.title,
    text: item.selftext || item.body || item.text,
    subreddit: item.subreddit || item.subredditName,
    author: {
      username: item.author || item.authorName,
      id: item.authorId,
    },
    engagement: {
      // `??` keeps legitimate zeros (a post at score 0 or with 0 comments).
      score: item.score ?? derivedScore,
      upvotes: item.ups,
      downvotes: item.downs,
      comments: item.numComments ?? item.num_comments,
      awards: item.totalAwards ?? item.total_awards_received,
    },
    flair: item.linkFlair || item.link_flair_text,
    nsfw: item.over18 || item.over_18 || false,
    spoiler: item.spoiler || false,
    url: item.url || permalinkUrl,
    mediaUrl: item.mediaUrl || item.url_overridden_by_dest,
    createdAt: item.created || item.createdUtc,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Facebook posts scraper item onto a normalized record tagged
 * `source: 'facebook'`.
 *
 * Hashtags and mentions are extracted from `text` (falling back to
 * `message`) via the file's extractHashtags / extractMentions helpers.
 *
 * @param {object} item - Raw item from the scraper dataset.
 * @returns {object} Normalized record.
 */
function transformFacebook(item) {
  const textForTags = item.text || item.message;
  return {
    id: item.id || item.postId || generateId(),
    source: 'facebook',
    type: item.type || 'post',
    text: item.text || item.message || item.content,
    author: {
      name: item.authorName || item.user?.name,
      id: item.authorId || item.user?.id,
      url: item.authorUrl || item.user?.url,
    },
    engagement: {
      // `??` keeps a legitimate 0 instead of falling through to the *Count key.
      likes: item.likes ?? item.likesCount,
      comments: item.comments ?? item.commentsCount,
      shares: item.shares ?? item.sharesCount,
      reactions: item.reactions || {},
    },
    media: {
      images: item.images || [],
      videos: item.videos || [],
      links: item.links || [],
    },
    hashtags: extractHashtags(textForTags),
    mentions: extractMentions(textForTags),
    timestamp: item.time || item.timestamp || item.createdAt,
    url: item.url || item.postUrl,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
/**
 * Map a raw Google Places item onto a normalized record tagged
 * `source: 'google-places'`.
 *
 * Snake_case keys are read first, with camelCase keys as fallbacks. At most
 * 5 photos (reduced to `photo_reference` when present) and 5 reviews
 * (reduced to text / rating / author) are kept.
 *
 * @param {object} item - Raw item from the dataset.
 * @returns {object} Normalized record.
 */
function transformGooglePlaces(item) {
  const point = item.geometry?.location;
  return {
    id: item.place_id || item.placeId || generateId(),
    source: 'google-places',
    name: item.name,
    rating: item.rating,
    // `??` keeps legitimate zeros: 0 ratings, a price level of 0, or a
    // coordinate lying on the equator / prime meridian.
    reviewCount: item.user_ratings_total ?? item.reviewCount,
    priceLevel: item.price_level ?? item.priceLevel,
    address: item.formatted_address || item.address,
    phone: item.formatted_phone_number || item.phone,
    website: item.website,
    types: item.types || [],
    location: {
      lat: point?.lat ?? item.lat,
      lng: point?.lng ?? item.lng,
    },
    hours: item.opening_hours || item.hours,
    photos: (item.photos || []).slice(0, 5).map((p) => p.photo_reference || p),
    reviews: (item.reviews || []).slice(0, 5).map((r) => ({
      text: r.text,
      rating: r.rating,
      author: r.author_name,
    })),
    url: item.url,
    scrapedAt: item.scrapedAt || new Date().toISOString(),
  };
}
// ============================================
// UTILITY FUNCTIONS
// ============================================

View file

@ -2,6 +2,7 @@ import { Actor, log } from 'apify';
import { GoogleGenerativeAI } from '@google/generative-ai';
import { createRequire } from 'module';
import { integrateActorData, SUPPORTED_ACTORS, USE_CASE_TEMPLATES, getTemplate, listSupportedActors, listTemplates } from './integrations.js';
import { addEmbeddingsToRecords, generateRandomEmbedding, EMBEDDING_MODELS } from './embeddings.js';
// CJS import workaround for RuvLLM native extension
const require = createRequire(import.meta.url);
@ -58,10 +59,13 @@ try {
sonaEnabled = true,
ewcLambda = 2000,
patternThreshold = 0.7,
sonaLearningTiers = ['instant', 'background']
sonaLearningTiers = ['instant', 'background'],
// ONNX Embedding parameters
useOnnxEmbeddings = true,
embeddingModel = 'all-MiniLM-L6-v2'
} = input;
log.info('AI Synthetic Data Generator v2.2 with TRM/SONA', { mode, dataType, count, provider, model, sonaEnabled });
log.info('AI Synthetic Data Generator v2.5 with ONNX Embeddings & TRM/SONA', { mode, dataType, count, provider, model, sonaEnabled, useOnnxEmbeddings, embeddingModel });
// Initialize SONA if available and enabled
if (ruvllm && sonaEnabled) {
@ -274,33 +278,42 @@ try {
// EMBEDDING GENERATION (optional)
// ============================================
if (generateEmbeddings && generatedData.length > 0) {
log.info(`Generating embeddings with ${embeddingDimensions} dimensions...`);
const modelConfig = EMBEDDING_MODELS[embeddingModel] || EMBEDDING_MODELS['all-MiniLM-L6-v2'];
const effectiveDimensions = useOnnxEmbeddings ? modelConfig.dimensions : embeddingDimensions;
const random = createSeededRandom(seed);
generatedData = generatedData.map((item, idx) => {
// Generate normalized random embedding
const embedding = [];
let norm = 0;
for (let j = 0; j < embeddingDimensions; j++) {
const val = random() * 2 - 1;
embedding.push(val);
norm += val * val;
}
norm = Math.sqrt(norm);
for (let j = 0; j < embeddingDimensions; j++) {
embedding[j] = Math.round((embedding[j] / norm) * 1000000) / 1000000;
}
return {
...item,
embedding,
embeddingDimensions
};
log.info(`Generating embeddings with ${effectiveDimensions} dimensions...`, {
useOnnx: useOnnxEmbeddings,
model: useOnnxEmbeddings ? embeddingModel : 'random'
});
if (useOnnxEmbeddings) {
// Use ONNX-powered semantic embeddings
try {
generatedData = await addEmbeddingsToRecords(generatedData, { modelName: embeddingModel });
log.info(`Added ONNX embeddings using ${embeddingModel} model`);
await Actor.charge({ eventName: 'onnx-embedding-generation', count: generatedData.length });
} catch (e) {
log.warning(`ONNX embedding failed: ${e.message}. Falling back to random embeddings.`);
// Fall back to random embeddings
const random = createSeededRandom(seed);
generatedData = generatedData.map((item) => ({
...item,
embedding: generateRandomEmbedding(effectiveDimensions, random),
embeddingModel: 'random',
embeddingDimensions: effectiveDimensions
}));
}
} else {
// Use random embeddings (faster, for testing)
const random = createSeededRandom(seed);
generatedData = generatedData.map((item) => ({
...item,
embedding: generateRandomEmbedding(effectiveDimensions, random),
embeddingModel: 'random',
embeddingDimensions: effectiveDimensions
}));
}
// Charge for embedding generation
await Actor.charge({ eventName: 'embedding-generation', count: generatedData.length });
log.info(`Added embeddings to ${generatedData.length} records`);
@ -476,7 +489,7 @@ try {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'User-Agent': 'Apify-AI-Synthetic-Data-Generator/2.2'
'User-Agent': 'Apify-AI-Synthetic-Data-Generator/2.5'
},
body: JSON.stringify(webhookPayload)
});

View file

@ -0,0 +1 @@
{"mode":"backtest","symbols":["AAPL"]}

View file

@ -1371,7 +1371,12 @@ await Actor.main(async () => {
const signals = [];
for (const symbol of symbols) {
const marketData = generateMarketData(symbol, lookbackPeriod, { stopLoss, takeProfit, timeframe });
const technicalData = { rsi: TechnicalIndicators.calculateRSI(marketData.prices), macd: TechnicalIndicators.calculateMACD(marketData.prices) };
const technicalData = {
rsi: TechnicalIndicators.calculateRSI(marketData.prices),
macd: TechnicalIndicators.calculateMACD(marketData.prices),
bollinger: TechnicalIndicators.calculateBollinger(marketData.prices),
atr: TechnicalIndicators.calculateATR(marketData.highs, marketData.lows, marketData.prices)
};
const features = prepareFeatures(marketData, technicalData);
const output = neuralEngine.forward(features);
const signal = signalGenerator.generateSignal([output[0]], marketData);

View file

@ -0,0 +1,32 @@
{
"timestamp": "2025-12-13T19:20:02.211Z",
"symbol": "BTC/USD",
"price": 245.881612493563,
"signal": "SELL",
"confidence": 100,
"reasons": [
"Neural prediction: 1.93%",
"Patterns: double_top"
],
"target": 233.58753186888484,
"stopLoss": 252.02865280590206,
"patterns": [
"double_top"
],
"technical": {
"rsi": null,
"macd": null,
"bollinger": null,
"atr": null
},
"prediction": 0.01929405266773886,
"swarmPredictions": [
0.0732855217097578,
0,
0,
0,
0.02318474162893649
],
"timeframe": "1h",
"strategy": "ensemble"
}

View file

@ -0,0 +1,5 @@
{
"mode": "analyze",
"symbols": ["AAPL"],
"dataSource": "yahoo"
}

View file

@ -0,0 +1,7 @@
{
"mode": "backtest",
"symbols": ["AAPL"],
"initialCapital": 100000,
"monteCarloRuns": 100,
"strategy": "ensemble"
}