Automate Social Media Data Collection
Manual data collection doesn't scale.
You need automation—systems that run on schedule, handle errors, and deliver clean data without babysitting.
This guide covers everything from simple cron jobs to production-grade pipelines.
What We're Building
An automated system that:
- Collects social media data on schedule
- Handles failures gracefully
- Stores data efficiently
- Alerts you when something's wrong
- Scales to thousands of profiles
Architecture Options
Simple: Cron + Script
Best for: Small projects, personal use
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Cron Job │ --> │ Script │ --> │ Database │
└─────────────┘ └─────────────┘ └─────────────┘
Medium: Task Queue
Best for: Growing projects, multiple data sources
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Scheduler │ --> │ Queue │ --> │ Workers │
└─────────────┘ │ (Redis) │ │ (Multiple) │
└─────────────┘ └─────────────┘
│
v
┌─────────────┐
│ Database │
└─────────────┘
Enterprise: Orchestrated Pipelines
Best for: Large scale, complex workflows
Airflow/Dagster → Kubernetes Jobs → API Calls → Data Lake → Transform → Warehouse
Simple Automation: Node.js + Cron
Basic Script
// collect.js
require('dotenv').config();
const Database = require('better-sqlite3');

const API_KEY = process.env.SOCIAVAULT_API_KEY;
const API_BASE = 'https://api.sociavault.com/v1/scrape';

const db = new Database('social_data.db');

// Initialize database.
// NOTE: SQLite does not allow expressions such as DATE(collected_at) inside a
// table-level UNIQUE constraint — `UNIQUE(platform, username, DATE(collected_at))`
// is a parse error. The "one snapshot per account per day" rule is expressed
// instead as a UNIQUE INDEX on an expression, which SQLite does support and
// which INSERT OR REPLACE honours the same way.
db.exec(`
  CREATE TABLE IF NOT EXISTS profiles (
    id INTEGER PRIMARY KEY,
    platform TEXT,
    username TEXT,
    followers INTEGER,
    following INTEGER,
    posts INTEGER,
    engagement_rate REAL,
    collected_at DATETIME DEFAULT CURRENT_TIMESTAMP
  );
  CREATE UNIQUE INDEX IF NOT EXISTS idx_profiles_daily
    ON profiles (platform, username, DATE(collected_at));
`);
/**
 * Fetch a single profile snapshot from the SociaVault API.
 *
 * @param {string} platform - e.g. 'tiktok', 'instagram', 'youtube'.
 * @param {string} username - account handle; URL-encoded before sending.
 * @returns {Promise<object>} parsed JSON body from the API.
 * @throws {Error} when the API responds with a non-2xx status.
 */
async function fetchProfile(platform, username) {
  // Build the URL with URL/searchParams so handles containing reserved
  // characters (spaces, '&', '?', non-ASCII) are encoded correctly — raw
  // template interpolation would corrupt the query string.
  const url = new URL(`${API_BASE}/${encodeURIComponent(platform)}/profile`);
  url.searchParams.set('username', username);

  const response = await fetch(url, {
    headers: { 'Authorization': `Bearer ${API_KEY}` }
  });
  if (!response.ok) {
    throw new Error(`API error: ${response.status}`);
  }
  return response.json();
}
/**
 * Collect one snapshot for each tracked account and upsert it into the
 * `profiles` table. Failures are logged per account so one bad profile
 * does not abort the whole run.
 */
async function collectData() {
  // Accounts to track — hard-coded here; move to a table/config to scale.
  const accounts = [
    { platform: 'tiktok', username: 'charlidamelio' },
    { platform: 'tiktok', username: 'khaby.lame' },
    { platform: 'instagram', username: 'natgeo' },
    { platform: 'youtube', username: 'MrBeast' }
  ];
  const insert = db.prepare(`
    INSERT OR REPLACE INTO profiles
      (platform, username, followers, following, posts, engagement_rate)
    VALUES (?, ?, ?, ?, ?, ?)
  `);
  for (const account of accounts) {
    try {
      console.log(`Fetching ${account.platform}/${account.username}...`);
      const result = await fetchProfile(account.platform, account.username);
      const data = result.data;
      // Use ?? (nullish coalescing), not ||: a legitimate count of 0
      // followers/posts is falsy and `||` would silently discard it.
      const followers = data.follower_count ?? data.followers ?? null;
      insert.run(
        account.platform,
        account.username,
        followers,
        data.following_count ?? data.following ?? null,
        data.video_count ?? data.posts_count ?? 0,
        data.engagement_rate ?? null
      );
      console.log(`  ✓ ${followers} followers`);
      // Rate limiting between API calls.
      await new Promise(r => setTimeout(r, 500));
    } catch (error) {
      // Log and continue: one failing account must not stop the batch.
      console.error(`  ✗ Error: ${error.message}`);
    }
  }
  console.log('Collection complete!');
}
collectData().catch(console.error);
Schedule with Cron
# Run every hour
0 * * * * cd /path/to/project && node collect.js >> logs/collect.log 2>&1
# Run every day at 6 AM
0 6 * * * cd /path/to/project && node collect.js >> logs/collect.log 2>&1
Or use node-cron for in-process scheduling:
const cron = require('node-cron');

// Kick off a collection run at the top of every hour.
const hourlySchedule = '0 * * * *';
cron.schedule(hourlySchedule, () => {
  console.log('Starting scheduled collection...');
  collectData().catch(console.error);
});
console.log('Scheduler running...');
Medium Scale: Queue-Based System
With BullMQ (Redis)
// worker.js
const { Worker, Queue } = require('bullmq');
const Redis = require('ioredis');

const connection = new Redis(process.env.REDIS_URL);

// Create queue
const profileQueue = new Queue('profiles', { connection });

// Job processor: fetch one profile from the API and persist it; the
// returned object is what the 'completed' handler receives.
const processProfileJob = async (job) => {
  const { platform, username } = job.data;
  console.log(`Processing ${platform}/${username}`);
  const result = await fetchProfile(platform, username);
  await saveToDatabase(result.data);
  return { success: true, followers: result.data.follower_count };
};

// Worker to process jobs
const worker = new Worker('profiles', processProfileJob, {
  connection,
  concurrency: 5, // Process 5 at a time
  limiter: {
    max: 10,
    duration: 1000 // Max 10 jobs per second
  }
});

worker.on('completed', (job, result) => {
  console.log(`✓ ${job.data.username}: ${result.followers} followers`);
});

worker.on('failed', (job, error) => {
  console.error(`✗ ${job.data.username}: ${error.message}`);
});
// scheduler.js
const { Queue } = require('bullmq');
const Redis = require('ioredis');
const connection = new Redis(process.env.REDIS_URL);
const profileQueue = new Queue('profiles', { connection });
// Enqueue one 'collect' job per active tracked account.
// NOTE(review): `db` is not defined in this snippet — presumably the app's
// database handle. `db.all(...)` suggests an async driver (e.g. node-sqlite3
// or knex), not better-sqlite3's synchronous API used in collect.js; confirm
// which client this file is meant to use.
async function scheduleCollection() {
const accounts = await db.all('SELECT * FROM tracked_accounts WHERE active = 1');
for (const account of accounts) {
// Each job is retried up to 3 times with exponential backoff (2s, 4s, 8s).
await profileQueue.add('collect', {
platform: account.platform,
username: account.username
}, {
attempts: 3,
backoff: {
type: 'exponential',
delay: 2000
}
});
}
console.log(`Scheduled ${accounts.length} jobs`);
}
// Run on schedule
const cron = require('node-cron');
cron.schedule('0 */6 * * *', scheduleCollection); // Every 6 hours
Python Alternative
# collector.py
import os
import sqlite3
import requests
import schedule
import time
from datetime import datetime
API_KEY = os.getenv('SOCIAVAULT_API_KEY')
API_BASE = 'https://api.sociavault.com/v1/scrape'
def get_db():
    """Open the local SQLite snapshot store, creating the schema if needed.

    Returns a connection with the `profiles` table guaranteed to exist.
    """
    connection = sqlite3.connect('social_data.db')
    ddl = '''
CREATE TABLE IF NOT EXISTS profiles (
id INTEGER PRIMARY KEY,
platform TEXT,
username TEXT,
followers INTEGER,
collected_at TEXT
)
'''
    connection.execute(ddl)
    return connection
def fetch_profile(platform, username):
    """GET one profile snapshot from the SociaVault API.

    Raises requests.HTTPError on a non-2xx response; returns parsed JSON.
    """
    url = f'{API_BASE}/{platform}/profile'
    auth_header = {'Authorization': f'Bearer {API_KEY}'}
    response = requests.get(url, params={'username': username}, headers=auth_header)
    response.raise_for_status()
    return response.json()
def collect_all():
    """Fetch every tracked account once and append a snapshot row.

    Each account is handled independently: a failure is printed and the
    loop moves on, so one bad profile cannot abort the batch.
    """
    accounts = [
        ('tiktok', 'charlidamelio'),
        ('instagram', 'natgeo'),
        ('youtube', 'MrBeast')
    ]
    conn = get_db()
    try:
        for platform, username in accounts:
            try:
                result = fetch_profile(platform, username)
                data = result['data']
                # Prefer follower_count, falling back to followers. Use an
                # explicit None test instead of `or`: a real count of 0 is
                # falsy and `or` would silently discard it.
                followers = data.get('follower_count')
                if followers is None:
                    followers = data.get('followers')
                conn.execute(
                    'INSERT INTO profiles (platform, username, followers, collected_at) VALUES (?, ?, ?, ?)',
                    (platform, username, followers, datetime.now().isoformat())
                )
                conn.commit()
                print(f'✓ {platform}/{username}: {followers}')
                time.sleep(0.5)  # crude rate limit between API calls
            except Exception as e:
                print(f'✗ {platform}/{username}: {e}')
    finally:
        # Close the connection even if something outside the inner
        # try/except blows up (the original leaked the handle then).
        conn.close()
# Schedule
# Run a collection at the top of every hour, plus a fixed daily 06:00 run.
schedule.every().hour.do(collect_all)
schedule.every().day.at('06:00').do(collect_all)
print('Scheduler running...')
# Blocking scheduler loop: wake once a minute and fire any due jobs.
while True:
schedule.run_pending()
time.sleep(60)
No-Code: n8n Workflow
For those who prefer visual workflows:
{
"name": "Social Media Collector",
"nodes": [
{
"name": "Schedule Trigger",
"type": "n8n-nodes-base.scheduleTrigger",
"parameters": {
"rule": {
"interval": [{"field": "hours", "interval": 6}]
}
}
},
{
"name": "Get Accounts",
"type": "n8n-nodes-base.postgres",
"parameters": {
"operation": "executeQuery",
"query": "SELECT platform, username FROM tracked_accounts WHERE active = true"
}
},
{
"name": "Loop Over Accounts",
"type": "n8n-nodes-base.splitInBatches",
"parameters": {
"batchSize": 1
}
},
{
"name": "Fetch Profile",
"type": "n8n-nodes-base.httpRequest",
"parameters": {
"url": "=https://api.sociavault.com/v1/scrape/{{ $json.platform }}/profile",
"qs": {"username": "={{ $json.username }}"},
"headers": {"Authorization": "Bearer {{ $env.SOCIAVAULT_API_KEY }}"}
}
},
{
"name": "Save to Database",
"type": "n8n-nodes-base.postgres",
"parameters": {
"operation": "insert",
"table": "profile_snapshots"
}
}
]
}
Error Handling & Alerts
Retry Logic
/**
 * Run an async operation, retrying with exponential backoff on failure.
 *
 * @param {() => Promise<*>} fn - operation to attempt.
 * @param {number} [maxAttempts=3] - total attempts before giving up.
 * @param {number} [baseDelay=1000] - backoff unit in ms; attempt N waits
 *   2^N * baseDelay before retrying. Parameterised (default preserves the
 *   original 1000ms behaviour) so callers and tests can tune the pacing.
 * @returns {Promise<*>} resolved value of the first successful attempt.
 * @throws the last error when every attempt fails.
 */
async function fetchWithRetry(fn, maxAttempts = 3, baseDelay = 1000) {
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      // Out of attempts: surface the original error to the caller.
      if (attempt === maxAttempts) throw error;
      const delay = Math.pow(2, attempt) * baseDelay; // Exponential backoff
      console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
      await new Promise(r => setTimeout(r, delay));
    }
  }
}
// Usage
// Wrap a flaky API call: up to 3 attempts with exponential backoff.
const data = await fetchWithRetry(() =>
fetchProfile('tiktok', 'username')
);
Slack Alerts
/**
 * Post a plain-text notification to the Slack incoming webhook configured
 * via the SLACK_WEBHOOK_URL environment variable.
 */
async function sendAlert(message) {
  const payload = { text: message };
  await fetch(process.env.SLACK_WEBHOOK_URL, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
}
// In your collector
// Notify Slack only when an entire run fails; per-account errors are
// already caught and logged inside collectData itself.
try {
await collectData();
} catch (error) {
await sendAlert(`⚠️ Collection failed: ${error.message}`);
}
Health Checks
// health.js
/**
 * Probe each dependency and report reachability. Probes never throw:
 * a failing dependency is simply reported as `false`, so the caller
 * always receives a complete status map.
 *
 * @returns {Promise<{database: boolean, api: boolean, redis: boolean}>}
 */
async function checkHealth() {
  const checks = {
    database: false,
    api: false,
    redis: false
  };

  // Database: a trivial SELECT proves the handle is usable.
  try {
    db.prepare('SELECT 1').get();
    checks.database = true;
  } catch (e) {
    // unreachable — leave false
  }

  // API: the /health endpoint must answer with a 2xx.
  try {
    const apiResponse = await fetch(`${API_BASE}/health`, {
      headers: { 'Authorization': `Bearer ${API_KEY}` }
    });
    checks.api = apiResponse.ok;
  } catch (e) {
    // network failure — leave false
  }

  // Redis: PING round-trip.
  try {
    await redis.ping();
    checks.redis = true;
  } catch (e) {
    // connection refused — leave false
  }

  return checks;
}
Scaling Tips
1. Batch Your Requests
// Instead of single requests
// (anti-pattern: one HTTP round-trip per account, paid serially)
// NOTE(review): fetchProfile is called with a single argument here, while the
// collector above defines it as fetchProfile(platform, username) — confirm
// which signature this example intends.
for (const username of usernames) {
await fetchProfile(username);
}
// Use batching if API supports it
// A single POST carries every profile request; the server fans out
// internally and returns all results in one response body.
const results = await fetch(`${API_BASE}/batch`, {
method: 'POST',
headers: {
'Authorization': `Bearer ${API_KEY}`,
'Content-Type': 'application/json'
},
body: JSON.stringify({
requests: usernames.map(u => ({
endpoint: '/tiktok/profile',
params: { username: u }
}))
})
}).then(r => r.json());
2. Cache Intelligently
const NodeCache = require('node-cache');
const cache = new NodeCache({ stdTTL: 3600 }); // 1 hour

/**
 * Return a profile from the in-process cache when present; otherwise
 * fetch it from the API and cache it for the TTL configured above.
 */
async function getCachedProfile(platform, username) {
  const cacheKey = `${platform}:${username}`;
  const cached = cache.get(cacheKey);
  if (cached) {
    return cached;
  }
  const fresh = await fetchProfile(platform, username);
  cache.set(cacheKey, fresh);
  return fresh;
}
3. Use Connection Pooling
const { Pool } = require('pg');
// Shared Postgres pool: up to 20 concurrent connections; idle connections
// are closed after 30s. Reusing connections avoids a TCP + auth handshake
// per query.
const pool = new Pool({
connectionString: process.env.DATABASE_URL,
max: 20,
idleTimeoutMillis: 30000
});
// Persist one profile snapshot. The column list and parameter array below
// are placeholders ('...' / [...]) to be filled in — as written the `[...]`
// line is not valid JavaScript and will not parse.
async function saveProfile(data) {
const client = await pool.connect();
try {
await client.query(
'INSERT INTO profiles (...) VALUES (...)',
[...]
);
} finally {
// Always return the connection to the pool, even when the query throws.
client.release();
}
}
Monitoring Dashboard
Track your collection health:
// metrics.js
const prometheus = require('prom-client');
// Counter of collection attempts, labelled by platform and success/error.
const collectionsTotal = new prometheus.Counter({
name: 'social_collections_total',
help: 'Total number of profile collections',
labelNames: ['platform', 'status']
});
// Latency histogram for a single profile collection, per platform.
const collectionDuration = new prometheus.Histogram({
name: 'social_collection_duration_seconds',
help: 'Duration of collection operations',
labelNames: ['platform']
});
// In collector
// NOTE(review): `platform` and `username` are assumed to be in scope — this
// fragment is meant to be pasted inside the collection loop, not run as-is.
const timer = collectionDuration.startTimer({ platform });
try {
await fetchProfile(platform, username);
collectionsTotal.inc({ platform, status: 'success' });
} catch (error) {
collectionsTotal.inc({ platform, status: 'error' });
} finally {
// Record the elapsed time regardless of success or failure.
timer();
}
Ready to automate your data collection?
Get your API key at sociavault.com with 50 free credits.
Related:
Found this helpful?
Share it with others who might benefit
Ready to Try SociaVault?
Start extracting social media data with our powerful API. No credit card required.