Files
arnes/project/web/index/new/worker_bulk.php

256 lines
9.4 KiB
PHP
Executable File

<?php
// ============================================================
// worker_bulk.php — dual prompt: English and Spanish generated independently
// ============================================================
// Project bootstrap. NOTE(review): presumably this is where legacy_config() and
// legacy_new_mysqli() come from — neither is defined in the visible part of this file.
require_once __DIR__ . '/bootstrap.php';
// Timestamps produced by date() (e.g. in the log) use Madrid local time.
date_default_timezone_set('Europe/Madrid');
// Default encoding for mb_* calls that do not pass one explicitly.
mb_internal_encoding('UTF-8');
// Remove the execution time limit: a batch performs several slow HTTP calls per
// product. The @ hides the warning raised where these settings cannot be changed.
@ini_set('max_execution_time', '0');
@set_time_limit(0);
/* === CONFIG === */
// OpenAI credentials, model and endpoint.
$OPENAI_API_KEY  = trim((string) legacy_config('openai.api_key', ''));
$OPENAI_MODEL    = legacy_config('openai.model', 'gpt-4o-mini');
$OPENAI_ENDPOINT = legacy_config('openai.endpoint', 'https://api.openai.com/v1/chat/completions');

// language_id values used when reading/writing oc_product_description.
$LANG_ES = (int) legacy_config('store.language_es', 4);
$LANG_EN = (int) legacy_config('store.language_en', 1);

// Store name appended to the generated page titles.
$STORE_NAME = legacy_config('store.name', 'Natural - Mercado de Vida');

// Log file and prompt template locations.
$LOG_FILE       = legacy_config('paths.worker_log', __DIR__ . '/logs/worker.log');
$PROMPT_EN_FILE = legacy_config('paths.prompt_en', __DIR__ . '/inc/prompt_en.md');
$PROMPT_ES_FILE = legacy_config('paths.prompt_es', __DIR__ . '/inc/prompt_es.md');

// Queue rows taken per run, and the minimum length a generated description must reach.
$BATCH_SIZE      = (int) legacy_config('worker.batch_size', 20);
$MIN_HTML_LENGTH = (int) legacy_config('worker.min_html_length', 500);

// Sharding defaults: a single worker that owns the whole queue.
$SHARD_TOTAL = 1;
$SHARD_INDEX = 0;

// On the command line, --shards=N sets the number of workers and --shard=K
// selects which slice this process handles.
if (PHP_SAPI === 'cli' && isset($argv)) {
    foreach ($argv as $arg) {
        if (strncmp($arg, '--shards=', 9) === 0) {
            $value = (int) substr($arg, 9);
            if ($value > 0) {
                // Capped at 16 to avoid excessive parallel load.
                $SHARD_TOTAL = $value > 16 ? 16 : $value;
            }
            continue;
        }
        if (strncmp($arg, '--shard=', 8) === 0) {
            $value = (int) substr($arg, 8);
            if ($value >= 0) {
                $SHARD_INDEX = $value;
            }
        }
    }
}

// Keep the index inside [0, SHARD_TOTAL - 1].
$SHARD_INDEX = max(0, min($SHARD_INDEX, $SHARD_TOTAL - 1));
/* === FUNCIONES === */
/**
 * Appends one timestamped line to the worker log.
 *
 * The line is written as "[Y-m-d H:i:s] <message>\n" to the file named by the
 * global $LOG_FILE.
 *
 * @param string $msg Text to record.
 * @return void
 */
function log_msg($msg) {
    global $LOG_FILE;
    $time = date('Y-m-d H:i:s');
    // LOCK_EX: the worker can run as several shard processes sharing this log,
    // so each append takes an exclusive lock to keep lines from interleaving.
    file_put_contents($LOG_FILE, "[$time] $msg\n", FILE_APPEND | LOCK_EX);
}
/**
 * Sends a single-message chat request to the configured OpenAI endpoint and
 * returns the reply text.
 *
 * The request is attempted up to $retries times. An attempt succeeds only when
 * the server answers HTTP 200 and the reply text is longer than 50 characters
 * after trimming. Every failure is written to the worker log.
 *
 * @param string $prompt     Full prompt, sent as one "user" message.
 * @param string $key        API key; empty or placeholder ("CHANGE_ME_...") keys are rejected.
 * @param string $model      Model identifier placed in the request body.
 * @param int    $max_tokens Upper bound passed as "max_tokens".
 * @param int    $retries    Maximum number of attempts.
 * @return string Trimmed reply text, or '' when no usable reply was obtained.
 */
function obtener_respuesta($prompt, $key, $model, $max_tokens = 2000, $retries = 3) {
    $endpoint = legacy_config('openai.endpoint', 'https://api.openai.com/v1/chat/completions');

    // Normalise once so a whitespace-only or null key is rejected instead of
    // being sent as an empty bearer token.
    $key = trim((string) $key);
    if ($key === '' || strpos($key, 'CHANGE_ME_') === 0) {
        log_msg('❌ Missing openai.api_key in config/local.php');
        return '';
    }

    // The body is identical on every attempt, so encode it once. json_encode()
    // returns false (e.g. on malformed UTF-8 in the prompt); posting that would
    // send an empty request, so stop here with the reason logged.
    $payload = json_encode([
        'model' => $model,
        'messages' => [['role' => 'user', 'content' => $prompt]],
        'temperature' => 0.6,
        'max_tokens' => $max_tokens
    ]);
    if ($payload === false) {
        log_msg('❌ Could not encode request body: ' . json_last_error_msg());
        return '';
    }

    // Client errors that will fail again no matter how often they are repeated.
    $permanent = [400, 401, 403, 404];

    for ($i = 1; $i <= $retries; $i++) {
        // Pausing only makes sense when another attempt follows.
        $more = $i < $retries;

        $ch = curl_init($endpoint);
        if ($ch === false) {
            log_msg("⚠️ cURL error ($i/$retries): curl_init failed");
            if ($more) sleep(2);
            continue;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HTTPHEADER => [
                'Content-Type: application/json',
                'Authorization: Bearer ' . $key
            ],
            CURLOPT_POST => true,
            CURLOPT_POSTFIELDS => $payload,
            CURLOPT_TIMEOUT => 180
        ]);
        $result = curl_exec($ch);
        $err = curl_error($ch);
        $http = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($result === false || $err) {
            log_msg("⚠️ cURL error ($i/$retries): $err");
            if ($more) sleep(2);
            continue;
        }
        if ($http !== 200) {
            log_msg("⚠️ HTTP $http on attempt $i");
            if (in_array($http, $permanent, true)) {
                log_msg("❌ HTTP $http is not retryable, giving up");
                return '';
            }
            if ($more) sleep(3);
            continue;
        }

        $json = json_decode($result, true);
        $txt = is_array($json) ? ($json['choices'][0]['message']['content'] ?? '') : '';
        if (is_string($txt)) {
            $txt = trim($txt);
            // Very short replies are treated as failures and retried.
            if (mb_strlen($txt) > 50) return $txt;
        }
        log_msg("⚠️ Empty response attempt $i");
        if ($more) sleep(2);
    }

    log_msg("❌ No response after $retries attempts");
    return '';
}
/**
 * Cleans model-generated HTML before it is stored.
 *
 * Steps, in order: strip Markdown code-fence markers (keeping the text they
 * wrap), drop h1/h2 tags but keep their text, turn div/section tags into p
 * tags, remove script and style elements with their content, then collapse
 * runs of spaces/tabs and of blank lines.
 *
 * @param string|null $t Raw text returned by the model.
 * @return string Cleaned, trimmed HTML; '' for empty input.
 */
function limpiar_html($t) {
    if (!$t) return '';

    // Fence markers at the start of a line (with optional language tag) and at
    // the end of a line.
    $t = preg_replace('/^```[a-zA-Z]*\s*/m', '', $t);
    $t = preg_replace('/```$/m', '', $t);
    // Fence pairs sitting mid-line: unwrap them. Removing the pair together
    // with its body would throw away the generated HTML itself.
    $t = preg_replace('/```[a-zA-Z]*([\s\S]*?)```/', '$1', $t);

    // Drop h1/h2 tags but keep what they contain.
    $t = preg_replace('/<\/?h1[^>]*>/i', '', $t);
    $t = preg_replace('/<\/?h2[^>]*>/i', '', $t);

    // Convert div and section to <p>.
    $t = preg_replace('/<\s*div[^>]*>/i', '<p>', $t);
    $t = preg_replace('/<\s*\/div\s*>/i', '</p>', $t);
    $t = preg_replace('/<\s*section[^>]*>/i', '<p>', $t);
    $t = preg_replace('/<\s*\/section\s*>/i', '</p>', $t);

    // Remove scripts and styles, content included.
    $t = preg_replace('/<script.*?<\/script>/is', '', $t);
    $t = preg_replace('/<style.*?<\/style>/is', '', $t);

    // Any unpaired fence marker still left.
    $t = str_replace('```', '', $t);

    // Whitespace clean-up.
    $t = preg_replace('/[ \t]+/', ' ', $t);
    $t = preg_replace('/\n{2,}/', "\n", $t);
    return trim($t);
}
/**
 * Prepares text for storage: removes characters outside the Basic Multilingual
 * Plane (U+10000 and above, which includes most emoji) and collapses every run
 * of whitespace into a single space.
 *
 * NOTE(review): presumably the target columns cannot hold 4-byte UTF-8
 * characters — confirm against the table charset.
 *
 * @param string|null $text Text to clean.
 * @return string Cleaned, trimmed text; '' for null or empty input.
 */
function sanitize_for_db($text) {
    if ($text === null || $text === '') return '';

    $pattern = '/[\x{10000}-\x{10FFFF}]/u';
    $stripped = preg_replace($pattern, '', $text);
    if ($stripped === null) {
        // With the /u modifier preg_replace() returns null when the subject is
        // not valid UTF-8, which would silently wipe the whole text. Replace
        // the malformed bytes and try again instead of losing the content.
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        $stripped = preg_replace($pattern, '', $text);
        if ($stripped === null) $stripped = $text;
    }

    $collapsed = preg_replace('/\s+/', ' ', $stripped);
    return trim($collapsed === null ? $stripped : $collapsed);
}
/**
 * Lower-cases the whole text, trims it, and upper-cases only its first
 * character (multibyte-aware, UTF-8).
 *
 * @param string|null $text Text to convert.
 * @return string Converted text; '' when the input is empty() (this includes '0').
 */
function sentence_case($text) {
    if (empty($text)) {
        return '';
    }

    $lowered = trim(mb_strtolower($text, 'UTF-8'));

    // Split into the leading character and everything after it.
    $head = mb_substr($lowered, 0, 1, 'UTF-8');
    $tail = mb_substr($lowered, 1, null, 'UTF-8');

    return mb_strtoupper($head, 'UTF-8') . $tail;
}
/* === DB === */
// Open the connection through the project helper; without a database there is
// nothing to process, so log the reason and stop.
$db = legacy_new_mysqli();
if ($db->connect_errno) {
    log_msg('❌ DB: ' . $db->connect_error);
    exit;
}

/* === Base prompts === */
// Both language templates must exist on disk.
if (!(file_exists($PROMPT_EN_FILE) && file_exists($PROMPT_ES_FILE))) {
    log_msg("❌ Missing prompt files.");
    exit;
}

$PROMPT_EN = file_get_contents($PROMPT_EN_FILE);
$PROMPT_ES = file_get_contents($PROMPT_ES_FILE);

// A template holding only whitespace counts as empty.
if (!(trim($PROMPT_EN) !== '' && trim($PROMPT_ES) !== '')) {
    log_msg("❌ Empty prompt files.");
    exit;
}
/* === Worker === */
// Main loop: take a batch of pending queue rows, generate an English and a
// Spanish description for each product, store both, and record the outcome on
// the queue row.
$shardLabel = $SHARD_TOTAL > 1 ? " | shard {$SHARD_INDEX}/{$SHARD_TOTAL}" : '';
log_msg("🚀 Worker iniciado (modo doble prompt, batch={$BATCH_SIZE}{$shardLabel})");

// With several shards, this worker only takes queue rows whose id leaves its
// own index as remainder, so workers started with distinct --shard values
// select disjoint rows.
$shardFilter = $SHARD_TOTAL > 1 ? " AND MOD(id, {$SHARD_TOTAL}) = {$SHARD_INDEX}" : '';
$q = $db->query("SELECT * FROM oc_product_queue WHERE processed=0{$shardFilter} ORDER BY id ASC LIMIT $BATCH_SIZE");
if (!$q) {
    // A failed query is not an empty queue: report the real cause.
    log_msg('❌ Error al leer la cola: ' . $db->error);
    exit;
}
if ($q->num_rows === 0) { log_msg("⏸️ Cola vacía."); exit; }

// Stores the generated copy for one language and reports whether the UPDATE
// ran without error. prepare() returns false on failure, so it is checked
// before binding. product_id and language_id are bound as integers.
$guardar_descripcion = function ($html, $meta, $title, $h1, $h2, $pid, $lang, $tag) use ($db) {
    $stmt = $db->prepare("UPDATE oc_product_description
        SET description=?, meta_description=?, u_title=?, u_h1=?, u_h2=?
        WHERE product_id=? AND language_id=?");
    if (!$stmt) {
        log_msg("❌ Error $tag $pid: " . $db->error);
        return false;
    }
    $stmt->bind_param('sssssii', $html, $meta, $title, $h1, $h2, $pid, $lang);
    $ok = $stmt->execute();
    if (!$ok) log_msg("❌ Error $tag $pid: " . $stmt->error);
    $stmt->close();
    return $ok;
};

while ($row = $q->fetch_assoc()) {
    $pid = (int)$row['product_id'];
    log_msg("🔄 Procesando producto $pid...");

    // The product name is read from the Spanish description; the LEFT JOIN
    // means it is NULL when that description row does not exist.
    $r = $db->query("
        SELECT p.ean, d.name
        FROM oc_product p
        LEFT JOIN oc_product_description d ON p.product_id=d.product_id AND d.language_id=$LANG_ES
        WHERE p.product_id=$pid
    ");
    $prod = $r ? $r->fetch_assoc() : null;
    if (!$prod) {
        log_msg("⚠️ Producto $pid no encontrado");
        $db->query("UPDATE oc_product_queue SET processed=1, log='No encontrado' WHERE product_id=$pid");
        continue;
    }

    $producto = (string)$prod['name'];
    $ean = (string)$prod['ean'];

    // Without a name the prompts would describe nothing: skip the two API
    // calls and flag the row for manual review instead of storing filler text.
    if (trim($producto) === '') {
        log_msg("⚠️ Producto $pid sin nombre (language_id=$LANG_ES)");
        $db->query("UPDATE oc_product_queue
            SET processed=1, processed_at=NOW(), result_en=0, result_es=0, needs_verify=1, log='Sin nombre'
            WHERE product_id=$pid");
        continue;
    }

    // === Per-product prompts ===
    // strtr() substitutes both placeholders in one pass, so placeholder text
    // that happens to appear inside the product name is never replaced again.
    $placeholders = ['$producto' => $producto, '$ean' => $ean];
    $prompt_en = strtr($PROMPT_EN, $placeholders);
    $prompt_es = strtr($PROMPT_ES, $placeholders);

    // === Generate EN ===
    $raw_en = obtener_respuesta($prompt_en, $OPENAI_API_KEY, $OPENAI_MODEL, 2200);
    file_put_contents(__DIR__ . "/logs/raw_openai_en_$pid.txt", $raw_en);
    $clean_en = limpiar_html($raw_en);
    $html_en = sanitize_for_db($clean_en);
    // Meta description: first 255 characters of the tag-free text.
    $meta_en = sanitize_for_db(mb_substr(strip_tags($clean_en), 0, 255, 'UTF-8'));

    // === Generate ES ===
    $raw_es = obtener_respuesta($prompt_es, $OPENAI_API_KEY, $OPENAI_MODEL, 2200);
    file_put_contents(__DIR__ . "/logs/raw_openai_es_$pid.txt", $raw_es);
    $clean_es = limpiar_html($raw_es);
    $html_es = sanitize_for_db($clean_es);
    $meta_es = sanitize_for_db(mb_substr(strip_tags($clean_es), 0, 255, 'UTF-8'));

    // === Content length ===
    $len_en = mb_strlen($html_en);
    $len_es = mb_strlen($html_es);
    file_put_contents(__DIR__ . "/logs/html_debug_$pid.txt",
        "EN ($len_en):\n$html_en\n\nES ($len_es):\n$html_es"
    );
    // Both languages must reach the minimum length; otherwise nothing is
    // stored and the row is flagged for review.
    if ($len_en < $MIN_HTML_LENGTH || $len_es < $MIN_HTML_LENGTH) {
        log_msg("❌ Texto demasiado corto (EN=$len_en / ES=$len_es) PID $pid");
        $db->query("UPDATE oc_product_queue
            SET processed=1, processed_at=NOW(), result_en=0, result_es=0, needs_verify=1, log='Texto corto'
            WHERE product_id=$pid");
        continue;
    }

    // === Save ===
    $u_title_en = sentence_case("$producto | $STORE_NAME");
    $u_h1_en = $producto;
    $u_h2_en = sentence_case("benefits and properties of $producto");
    $u_title_es = sentence_case("comprar $producto | $STORE_NAME");
    $u_h1_es = $producto;
    $u_h2_es = sentence_case("propiedades y beneficios de $producto");

    $ok_en = $guardar_descripcion($html_en, $meta_en, $u_title_en, $u_h1_en, $u_h2_en, $pid, $LANG_EN, 'EN');
    $ok_es = $guardar_descripcion($html_es, $meta_es, $u_title_es, $u_h1_es, $u_h2_es, $pid, $LANG_ES, 'ES');

    if ($ok_en && $ok_es) {
        $db->query("UPDATE oc_product_queue
            SET processed=1, processed_at=NOW(), result_en=1, result_es=1, needs_verify=0, log='OK doble prompt'
            WHERE product_id=$pid");
        log_msg("$pid completado EN/ES (len EN=$len_en | ES=$len_es)");
    } else {
        // Record what actually happened: a failed write must not be reported
        // as a successful run.
        $res_en = $ok_en ? 1 : 0;
        $res_es = $ok_es ? 1 : 0;
        $db->query("UPDATE oc_product_queue
            SET processed=1, processed_at=NOW(), result_en=$res_en, result_es=$res_es, needs_verify=1, log='Error al guardar'
            WHERE product_id=$pid");
        log_msg("❌ $pid no se guardó por completo (EN=$res_en | ES=$res_es)");
    }

    // Short pause between products.
    usleep(100000);
}
log_msg("🏁 Worker finalizado.");