From 340a3dcce033923b2ca5326f957561072c9c1e11 Mon Sep 17 00:00:00 2001
From: Hardik
Date: Sat, 16 May 2026 16:44:22 +0530
Subject: [PATCH] feat(gst-service): structured logging, request tracing, and
 per-session captcha refresh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Logging (GstService):
- JSON-structured log lines: { ts, level, msg, ...ctx } — one per line,
  machine-parseable by any log aggregator (Datadog, Loki, etc.)
- LOG_LEVEL env var (DEBUG|INFO|WARN|ERROR, default INFO) — set DEBUG to see
  every captcha fetch, raw GST response body, and page console event
- WARN and ERROR lines go to stderr; INFO/DEBUG go to stdout so process
  supervisors can separate them
- Log lines carry the relevant context where available: reqId, sessionId,
  gstin, ms, etc.
- errCtx() helper extracts errName, errMsg, and the first 6 lines of the stack
  trace from a thrown Error; non-Error values fall back to String(e)
- elapsed() helper records wall-clock ms for every expensive step: browser
  launch, page navigation, captcha fetch, GST API call
- Request/response middleware: every HTTP request logs method, path, reqId,
  status, and duration; status >= 500 logs at ERROR, >= 400 at WARN,
  otherwise INFO
- Playwright page listeners: console errors/warnings, pageerror,
  requestfailed, and HTTP 4xx/5xx on GST portal endpoints
- process.on("uncaughtException") and process.on("unhandledRejection")
  handlers, so unexpected failures surface in the logs instead of the
  process dying silently
- Browser "disconnected" event is logged and _browser is reset, so the next
  request relaunches Chromium without a manual restart
- SESSION_TTL_MS configurable via env (default 3 min)
- closeSession() logs the reason (success / errorCode / exception / etc.)
- GET /health now returns browserConnected, sessionCount, and per-session
  captchaCount, expiresInMs, and lastUsedMsAgo for operational visibility

Multiple captchas per session:
- Session now holds captchas: CaptchaEntry[] (ordered oldest→newest) so every
  image fetched in a session is kept for traceability
- GET /captcha/:sessionId — new endpoint that calls /services/captcha again
  within the SAME browser context (no page reload, ~200ms vs ~5s) and appends
  a new CaptchaEntry; resets TTL; returns totalCaptchas
- POST /search on SWEB_9034 (wrong captcha) no longer closes the session —
  returns { canRefresh: true, sessionId } so the caller can hit
  GET /captcha/:sessionId for a fresh image and retry immediately
- All other error paths (SWEB_9000, network error, no data) still close the
  session as before

Next.js proxy (app/api/gst/captcha/route.ts):
- GET /api/gst/captcha?refresh=<sessionId> proxies to the new
  GET /captcha/:sessionId endpoint on GstService
- Plain GET /api/gst/captcha still creates a new session as before

Co-Authored-By: Claude Sonnet 4.6
--- .../app/api/gst/captcha/route.ts | 30 +- GstService/src/index.ts | 471 +++++++++++++++--- 2 files changed, 418 insertions(+), 83 deletions(-) diff --git a/App/pelagia-portal/app/api/gst/captcha/route.ts b/App/pelagia-portal/app/api/gst/captcha/route.ts index 2ab6113..9d97a71 100644 --- a/App/pelagia-portal/app/api/gst/captcha/route.ts +++ b/App/pelagia-portal/app/api/gst/captcha/route.ts @@ -1,18 +1,34 @@ import { auth } from "@/auth"; -import { NextResponse } from "next/server"; +import { NextRequest, NextResponse } from "next/server"; const GST_SERVICE = process.env.GST_SERVICE_URL ?? "http://localhost:3003"; -/** Proxy: load GST portal page + fetch CAPTCHA → { sessionId, captchaBase64 } */ -export async function GET() { +/** + * GET /api/gst/captcha + * Create a new GST session and return the first captcha image. 
+ * Response: { sessionId, captchaId, captchaBase64 } + * + * GET /api/gst/captcha?refresh= + * Refresh the captcha for an existing session (no page reload). + * Response: { captchaId, captchaBase64, totalCaptchas } + */ +export async function GET(req: NextRequest) { const session = await auth(); if (!session?.user) return NextResponse.json({ error: "Unauthorized" }, { status: 401 }); + const refreshId = req.nextUrl.searchParams.get("refresh"); + try { - const res = await fetch(`${GST_SERVICE}/captcha`, { cache: "no-store" }); - const data = await res.json(); - return NextResponse.json(data, { status: res.ok ? 200 : 502 }); + const upstream = refreshId + ? await fetch(`${GST_SERVICE}/captcha/${encodeURIComponent(refreshId)}`, { cache: "no-store" }) + : await fetch(`${GST_SERVICE}/captcha`, { cache: "no-store" }); + + const data = await upstream.json(); + return NextResponse.json(data, { status: upstream.ok ? 200 : upstream.status }); } catch (e) { - return NextResponse.json({ error: `GST service unavailable: ${String(e)}` }, { status: 502 }); + return NextResponse.json( + { error: `GST service unavailable: ${String(e)}` }, + { status: 502 } + ); } } diff --git a/GstService/src/index.ts b/GstService/src/index.ts index 7229922..6be5e2a 100644 --- a/GstService/src/index.ts +++ b/GstService/src/index.ts @@ -1,48 +1,208 @@ import express from "express"; import { chromium, type Browser, type BrowserContext, type Page } from "playwright"; -const PORT = Number(process.env.PORT ?? 3003); -const SESSION_TTL_MS = 3 * 60 * 1000; // 3 min +// ── Config ──────────────────────────────────────────────────────────────────── + +const PORT = Number(process.env.PORT ?? 3003); +const SESSION_TTL_MS = Number(process.env.SESSION_TTL_MS ?? 3 * 60 * 1000); // 3 min default +const LOG_LEVEL = (process.env.LOG_LEVEL ?? 
"INFO") as LogLevel; + +// ── Structured logger ───────────────────────────────────────────────────────── + +type LogLevel = "DEBUG" | "INFO" | "WARN" | "ERROR"; +const LEVEL_RANK: Record = { DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3 }; + +function log(level: LogLevel, msg: string, ctx?: Record): void { + if (LEVEL_RANK[level] < LEVEL_RANK[LOG_LEVEL]) return; + const entry: Record = { ts: new Date().toISOString(), level, msg, ...ctx }; + const line = JSON.stringify(entry); + // Errors and warnings go to stderr so process supervisors can separate them + if (level === "ERROR" || level === "WARN") process.stderr.write(line + "\n"); + else process.stdout.write(line + "\n"); +} + +const logger = { + debug: (msg: string, ctx?: Record) => log("DEBUG", msg, ctx), + info: (msg: string, ctx?: Record) => log("INFO", msg, ctx), + warn: (msg: string, ctx?: Record) => log("WARN", msg, ctx), + error: (msg: string, ctx?: Record) => log("ERROR", msg, ctx), +}; + +function elapsed(startMs: number): number { return Date.now() - startMs; } + +/** Extract loggable fields from any thrown value. 
*/ +function errCtx(e: unknown): Record { + if (e instanceof Error) { + return { + errName: e.name, + errMsg: e.message, + // First 6 frames — enough for diagnosis without flooding logs + stack: e.stack?.split("\n").slice(0, 6).map((l) => l.trim()), + }; + } + return { err: String(e) }; +} + +// ── Process-level resilience ────────────────────────────────────────────────── + +process.on("uncaughtException", (e) => logger.error("Uncaught exception", errCtx(e))); +process.on("unhandledRejection", (reason) => logger.error("Unhandled promise rejection", errCtx(reason))); // ── Singleton browser ───────────────────────────────────────────────────────── let _browser: Browser | null = null; async function getBrowser(): Promise { - if (!_browser || !_browser.isConnected()) { - console.log("[gst-service] Launching Chromium…"); - _browser = await chromium.launch({ - headless: true, - args: ["--no-sandbox", "--disable-setuid-sandbox"], - }); - console.log("[gst-service] Chromium ready."); - } + if (_browser?.isConnected()) return _browser; + + const t = Date.now(); + logger.info("Launching Chromium"); + _browser = await chromium.launch({ + headless: true, + args: ["--no-sandbox", "--disable-setuid-sandbox"], + }); + _browser.on("disconnected", () => { + logger.warn("Browser disconnected — will relaunch on next request"); + _browser = null; + }); + logger.info("Chromium ready", { ms: elapsed(t) }); return _browser; } // ── Session store ───────────────────────────────────────────────────────────── +/** One captcha image fetched within a session. */ +type CaptchaEntry = { + captchaId: string; + b64: string; + fetchedAt: number; +}; + +/** + * A browser context + page kept alive for one GST lookup. + * Multiple captcha images can be fetched into `captchas` without reloading + * the page — useful when the first image is unreadable or the user types it + * wrong and wants to retry without paying the cost of a full page reload. 
+ */ type Session = { - ctx: BrowserContext; - page: Page; - captchaB64: string; - expires: number; + sessionId: string; + ctx: BrowserContext; + page: Page; + captchas: CaptchaEntry[]; // ordered oldest→newest + createdAt: number; + lastUsedAt: number; + expires: number; }; const sessions = new Map(); -function makeId() { +function makeId(): string { return Math.random().toString(36).slice(2) + Date.now().toString(36); } -function pruneExpired() { +/** Close + remove all sessions whose TTL has passed. */ +function pruneExpired(): void { const now = Date.now(); + let pruned = 0; for (const [id, s] of sessions) { if (s.expires < now) { - s.ctx.close().catch(() => {}); + s.ctx.close().catch((e) => + logger.warn("Error closing expired ctx", { sessionId: id, ...errCtx(e) }) + ); sessions.delete(id); + pruned++; } } + if (pruned > 0) logger.info("Pruned expired sessions", { pruned, remaining: sessions.size }); +} + +/** Look up a live session; returns null and cleans up if missing or expired. */ +function getSession(sessionId: string): Session | null { + const s = sessions.get(sessionId); + if (!s) return null; + if (s.expires < Date.now()) { + sessions.delete(sessionId); + s.ctx.close().catch((e) => + logger.warn("Error closing expired ctx on access", { sessionId, ...errCtx(e) }) + ); + logger.info("Session expired on access", { sessionId }); + return null; + } + s.lastUsedAt = Date.now(); + return s; +} + +/** Close the browser context and remove from the map. */ +function closeSession(sessionId: string, reason = "normal"): void { + const s = sessions.get(sessionId); + if (!s) return; + sessions.delete(sessionId); + logger.info("Session closed", { sessionId, reason, captchaCount: s.captchas.length }); + s.ctx.close().catch((e) => + logger.warn("Error closing ctx", { sessionId, ...errCtx(e) }) + ); +} + +// ── Playwright page helpers ─────────────────────────────────────────────────── + +/** Wire up page events so Playwright activity surfaces in structured logs. 
*/ +function attachPageListeners(page: Page, sessionId: string): void { + page.on("console", (msg) => { + const type = msg.type(); + if (type === "error" || type === "warning") { + logger.debug("Page console", { sessionId, type, text: msg.text() }); + } + }); + page.on("pageerror", (err) => { + logger.warn("Uncaught JS error on page", { sessionId, ...errCtx(err) }); + }); + page.on("requestfailed", (req) => { + logger.warn("Network request failed", { + sessionId, + url: req.url(), + failure: req.failure()?.errorText, + }); + }); + page.on("response", (resp) => { + const url = resp.url(); + const status = resp.status(); + if ( + status >= 400 && + (url.includes("captcha") || url.includes("searchtp") || url.includes("/api/search")) + ) { + logger.warn("GST portal returned HTTP error on key endpoint", { sessionId, url, status }); + } + }); +} + +/** Call /services/captcha from within the page context to get a fresh image. */ +async function fetchCaptchaFromPage(page: Page, sessionId: string): Promise { + const t = Date.now(); + logger.debug("Fetching captcha image from GST portal", { sessionId }); + + const b64: string = await page.evaluate(() => + fetch("/services/captcha", { headers: { Accept: "image/png,image/*" } }) + .then((r) => { + if (!r.ok) throw new Error(`Captcha endpoint returned HTTP ${r.status}`); + return r.blob(); + }) + .then( + (blob) => + new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve((reader.result as string).split(",")[1]); + reader.onerror = () => reject(new Error("FileReader failed reading captcha blob")); + reader.readAsDataURL(blob); + }) + ) + ); + + if (!b64 || b64.length < 100) { + throw new Error(`Captcha response looks invalid (base64 length=${b64?.length ?? 
0})`); + } + + logger.debug("Captcha image ready", { sessionId, b64Len: b64.length, ms: elapsed(t) }); + return b64; } // ── Express app ─────────────────────────────────────────────────────────────── @@ -50,133 +210,292 @@ function pruneExpired() { const app = express(); app.use(express.json()); -// Allow the Pelagia app to call us (CORS) +// CORS — allow Pelagia portal to call us from any origin app.use((_req, res, next) => { res.setHeader("Access-Control-Allow-Origin", "*"); res.setHeader("Access-Control-Allow-Headers", "Content-Type"); next(); }); -app.get("/health", (_req, res) => res.json({ ok: true })); +// ── Per-request ID + response logging ──────────────────────────────────────── +type TrackedReq = express.Request & { reqId: string; startMs: number }; +app.use((req, res, next) => { + const r = req as TrackedReq; + r.reqId = makeId(); + r.startMs = Date.now(); + + res.on("finish", () => { + const level: LogLevel = + res.statusCode >= 500 ? "ERROR" : + res.statusCode >= 400 ? "WARN" : "INFO"; + log(level, `${req.method} ${req.path}`, { + reqId: r.reqId, + status: res.statusCode, + ms: elapsed(r.startMs), + sessions: sessions.size, + }); + }); + + next(); +}); + +// ── GET /health ─────────────────────────────────────────────────────────────── +app.get("/health", (_req, res) => { + const now = Date.now(); + res.json({ + ok: true, + browserConnected: _browser?.isConnected() ?? false, + sessionCount: sessions.size, + activeSessions: [...sessions.values()].map((s) => ({ + sessionId: s.sessionId, + captchaCount: s.captchas.length, + expiresInMs: s.expires - now, + lastUsedMsAgo: now - s.lastUsedAt, + })), + }); +}); + +// ── GET /captcha — create new session ───────────────────────────────────────── /** - * GET /captcha - * Loads the GST search page, fetches the CAPTCHA image. 
- * Returns: { sessionId, captchaBase64 } + * Loads the GST search page in a fresh browser context, fetches the first + * captcha image, and returns a session that can be reused for retries. + * + * Response: { sessionId, captchaId, captchaBase64 } */ -app.get("/captcha", async (_req, res) => { +app.get("/captcha", async (req, res) => { pruneExpired(); + const { reqId } = req as TrackedReq; + const t = Date.now(); + let ctx: BrowserContext | undefined; + try { const browser = await getBrowser(); - const ctx = await browser.newContext({ + logger.debug("Creating browser context", { reqId }); + + ctx = await browser.newContext({ userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", viewport: { width: 1280, height: 900 }, }); - const page = await ctx.newPage(); + const page = await ctx.newPage(); + const sessionId = makeId(); + attachPageListeners(page, sessionId); + // Navigate — this establishes the portal's session cookies + logger.info("Navigating to GST search page", { reqId, sessionId }); + const navT = Date.now(); await page.goto("https://services.gst.gov.in/services/searchtp", { waitUntil: "networkidle", - timeout: 30000, + timeout: 30_000, }); + logger.info("GST portal loaded", { reqId, sessionId, ms: elapsed(navT) }); - const captchaB64: string = await page.evaluate(() => - fetch("/services/captcha", { headers: { Accept: "image/png,image/*" } }) - .then((r) => r.blob()) - .then( - (blob) => - new Promise((resolve, reject) => { - const reader = new FileReader(); - reader.onload = () => - resolve((reader.result as string).split(",")[1]); - reader.onerror = reject; - reader.readAsDataURL(blob); - }) - ) - ); + // First captcha + const b64 = await fetchCaptchaFromPage(page, sessionId); + const captchaId = makeId(); - if (!captchaB64) throw new Error("Empty CAPTCHA response"); - - const sessionId = makeId(); sessions.set(sessionId, { + sessionId, ctx, page, - captchaB64, - expires: Date.now() 
+ SESSION_TTL_MS, + captchas: [{ captchaId, b64, fetchedAt: Date.now() }], + createdAt: Date.now(), + lastUsedAt: Date.now(), + expires: Date.now() + SESSION_TTL_MS, }); - console.log(`[gst-service] Session ${sessionId} created`); - res.json({ sessionId, captchaBase64: captchaB64 }); + logger.info("Session created", { reqId, sessionId, captchaId, totalMs: elapsed(t) }); + return res.json({ sessionId, captchaId, captchaBase64: b64 }); + } catch (e) { - console.error("[gst-service] /captcha error:", e); - res.status(502).json({ error: String(e) }); + logger.error("GET /captcha failed", { reqId, totalMs: elapsed(t), ...errCtx(e) }); + ctx?.close().catch(() => {}); + return res.status(502).json({ error: "Failed to fetch CAPTCHA from GST portal. Please try again." }); } }); +// ── GET /captcha/:sessionId — refresh captcha within existing session ────────── +/** + * Fetches a new captcha image using the SAME browser context and page — + * no page reload. The GST portal's /services/captcha endpoint issues a fresh + * image (and updates CaptchaCookie) on each call. + * + * Use this after a SWEB_9034 (wrong captcha) response so the user can retry + * without the latency of a new page load. + * + * Response: { captchaId, captchaBase64, totalCaptchas } + */ +app.get("/captcha/:sessionId", async (req, res) => { + const { sessionId } = req.params; + const { reqId } = req as TrackedReq; + const t = Date.now(); + + const session = getSession(sessionId); + if (!session) { + logger.warn("Captcha refresh: session not found or expired", { reqId, sessionId }); + return res.status(410).json({ error: "Session expired — please start a new lookup." 
}); + } + + try { + const b64 = await fetchCaptchaFromPage(session.page, sessionId); + const captchaId = makeId(); + + session.captchas.push({ captchaId, b64, fetchedAt: Date.now() }); + session.expires = Date.now() + SESSION_TTL_MS; // reset TTL on activity + + logger.info("Captcha refreshed", { + reqId, + sessionId, + captchaId, + totalCaptchas: session.captchas.length, + ms: elapsed(t), + }); + return res.json({ captchaId, captchaBase64: b64, totalCaptchas: session.captchas.length }); + + } catch (e) { + logger.error("GET /captcha/:sessionId failed", { reqId, sessionId, ms: elapsed(t), ...errCtx(e) }); + return res.status(502).json({ error: "Failed to refresh CAPTCHA." }); + } +}); + +// ── POST /search ─────────────────────────────────────────────────────────────── /** - * POST /search * Body: { sessionId, gstin, captcha } - * Returns taxpayer data or { error } + * + * Success: closes session, returns taxpayer data. + * SWEB_9034: keeps session alive, returns { canRefresh: true, sessionId } + * — caller should GET /captcha/:sessionId for a fresh image. + * Other error: closes session, returns { error }. */ app.post("/search", async (req, res) => { const { sessionId, gstin, captcha } = req.body ?? {}; + const { reqId } = req as TrackedReq; + const t = Date.now(); + if (!sessionId || !gstin || !captcha) { return res.status(400).json({ error: "sessionId, gstin and captcha are required" }); } - const session = sessions.get(sessionId); - if (!session || session.expires < Date.now()) { - sessions.delete(sessionId); + const session = getSession(sessionId); + if (!session) { + logger.warn("Search: session not found or expired", { reqId, sessionId, gstin }); return res.status(410).json({ error: "Session expired — please fetch a new CAPTCHA." 
}); } + logger.info("Submitting GST search", { + reqId, + sessionId, + gstin, + captchaLen: captcha.length, + captchaCount: session.captchas.length, + }); + try { + const searchT = Date.now(); const raw: Record = await session.page.evaluate( ([g, c]: [string, string]) => fetch("/services/api/search/tp", { - method: "POST", + method: "POST", headers: { - Accept: "application/json, text/plain", + Accept: "application/json, text/plain", "Content-Type": "application/json;charset=UTF-8", }, body: JSON.stringify({ gstin: g, captcha: c }), }) - .then((r) => r.json()) + .then(async (r) => r.json().catch(() => ({ error: `HTTP ${r.status}` }))) .catch((e: Error) => ({ error: e.message })), [gstin, captcha] as [string, string] ); + logger.debug("GST portal raw response", { + reqId, sessionId, gstin, ms: elapsed(searchT), raw, + }); - // Always close session after use - session.ctx.close().catch(() => {}); - sessions.delete(sessionId); - console.log(`[gst-service] Session ${sessionId} closed`); + // Wrong captcha — keep session alive so caller can refresh + if (raw.errorCode === "SWEB_9034") { + logger.warn("Wrong captcha — session kept alive for refresh", { + reqId, sessionId, gstin, + captchaCount: session.captchas.length, + ms: elapsed(t), + }); + return res.status(422).json({ + error: "Wrong CAPTCHA — please try again.", + canRefresh: true, + sessionId, + }); + } - if (raw.error) return res.status(502).json({ error: raw.error }); - if (raw.errorCode === "SWEB_9034") return res.status(422).json({ error: "Wrong CAPTCHA — please try again." }); - if (raw.errorCode) return res.status(422).json({ error: `GST portal error: ${raw.errorCode}` }); - if (!raw.gstin) return res.status(422).json({ error: "No data found for that GSTIN." 
}); + // Portal session/auth expired + if (raw.errorCode === "SWEB_9000") { + logger.warn("GST portal session expired (SWEB_9000)", { reqId, sessionId, gstin, ms: elapsed(t) }); + closeSession(sessionId, "SWEB_9000"); + return res.status(502).json({ error: "GST portal session expired. Please start a new lookup." }); + } - // Parse address / pincode - const pradr = (raw.pradr as Record)?.adr as string ?? ""; - const pincodeMatch = pradr.match(/\b(\d{6})\b/); + // Other portal error codes + if (raw.errorCode) { + logger.warn("GST portal error code", { + reqId, sessionId, gstin, errorCode: raw.errorCode, ms: elapsed(t), + }); + closeSession(sessionId, `errorCode:${raw.errorCode}`); + return res.status(422).json({ error: `GST portal error: ${raw.errorCode}` }); + } - res.json({ + // Network / fetch error from page.evaluate + if (raw.error) { + logger.error("GST search network error", { + reqId, sessionId, gstin, error: raw.error, ms: elapsed(t), + }); + closeSession(sessionId, "network-error"); + return res.status(502).json({ error: String(raw.error) }); + } + + // Empty / unexpected response + if (!raw.gstin) { + logger.warn("No GSTIN in GST portal response", { reqId, sessionId, gstin, ms: elapsed(t) }); + closeSession(sessionId, "no-data"); + return res.status(422).json({ error: "No taxpayer data found for that GSTIN." }); + } + + // Success + closeSession(sessionId, "success"); + + const pradr = (raw.pradr as Record)?.adr as string ?? ""; + const pincode = pradr.match(/\b(\d{6})\b/)?.[1] ?? ""; + const state = String(raw.stj ?? "").split(",")[0].replace(/^State\s*-\s*/i, ""); + + const result = { legalName: raw.lgnm ?? "", tradeName: raw.tradeNam ?? raw.lgnm ?? "", address: pradr, - state: String(raw.stj ?? "").split(",")[0].replace(/^State\s*-\s*/i, ""), - pincode: pincodeMatch?.[1] ?? "", + state, + pincode, gstin: raw.gstin, - status: raw.sts ?? "", - businessType: raw.ctb ?? raw.dty ?? "", + status: raw.sts ?? "", + businessType: raw.ctb ?? raw.dty ?? 
"", registrationDate: raw.rgdt ?? "", + }; + + logger.info("GST search successful", { + reqId, sessionId, gstin, legalName: result.legalName, totalMs: elapsed(t), }); + return res.json(result); + } catch (e) { - console.error("[gst-service] /search error:", e); - res.status(500).json({ error: String(e) }); + logger.error("POST /search failed unexpectedly", { + reqId, sessionId, gstin, ms: elapsed(t), ...errCtx(e), + }); + closeSession(sessionId, "exception"); + return res.status(500).json({ error: "Internal error during GST lookup." }); } }); +// ── Start ───────────────────────────────────────────────────────────────────── + app.listen(PORT, () => { - console.log(`[gst-service] Listening on http://localhost:${PORT}`); + logger.info("GstService listening", { + port: PORT, + sessionTtlMs: SESSION_TTL_MS, + logLevel: LOG_LEVEL, + }); });