// pelagia-portal/GstService/src/index.ts
//
// 501 lines
// 18 KiB
// TypeScript

import { randomUUID } from "node:crypto";

import express from "express";
import { chromium, type Browser, type BrowserContext, type Page } from "playwright";
// ── Config ────────────────────────────────────────────────────────────────────
const PORT = Number(process.env.PORT ?? 3003);
const SESSION_TTL_MS = Number(process.env.SESSION_TTL_MS ?? 3 * 60 * 1000); // 3 min default
const LOG_LEVEL = (process.env.LOG_LEVEL ?? "INFO") as LogLevel;
// ── Structured logger ─────────────────────────────────────────────────────────
type LogLevel = "DEBUG" | "INFO" | "WARN" | "ERROR";
const LEVEL_RANK: Record<LogLevel, number> = { DEBUG: 0, INFO: 1, WARN: 2, ERROR: 3 };
/**
 * Emit one JSON log line if `level` meets the configured LOG_LEVEL threshold.
 *
 * @param level Severity of this entry.
 * @param msg   Human-readable message.
 * @param ctx   Optional extra fields merged into the entry (spread last, so they can override ts/level/msg).
 */
function log(level: LogLevel, msg: string, ctx?: Record<string, unknown>): void {
  const belowThreshold = LEVEL_RANK[level] < LEVEL_RANK[LOG_LEVEL];
  if (belowThreshold) return;

  const serialized = JSON.stringify({ ts: new Date().toISOString(), level, msg, ...ctx });
  // Warnings and errors are routed to stderr so process supervisors can split the streams.
  const sink = level === "ERROR" || level === "WARN" ? process.stderr : process.stdout;
  sink.write(serialized + "\n");
}
/** Level-specific convenience wrappers around `log`. */
const logger = {
  debug(msg: string, ctx?: Record<string, unknown>): void {
    log("DEBUG", msg, ctx);
  },
  info(msg: string, ctx?: Record<string, unknown>): void {
    log("INFO", msg, ctx);
  },
  warn(msg: string, ctx?: Record<string, unknown>): void {
    log("WARN", msg, ctx);
  },
  error(msg: string, ctx?: Record<string, unknown>): void {
    log("ERROR", msg, ctx);
  },
};
/** Milliseconds between `startMs` (a Date.now() timestamp) and now. */
function elapsed(startMs: number): number {
  const nowMs = Date.now();
  return nowMs - startMs;
}
/**
 * Turn any thrown value into fields suitable for a structured log entry.
 *
 * @param e The caught value (not necessarily an Error).
 * @returns `{ errName, errMsg, stack }` for Error instances, `{ err }` otherwise.
 */
function errCtx(e: unknown): Record<string, unknown> {
  if (!(e instanceof Error)) {
    return { err: String(e) };
  }
  // Keep only the first 6 stack lines: enough to diagnose, short enough not to flood logs.
  const frames = e.stack
    ?.split("\n")
    .slice(0, 6)
    .map((frame) => frame.trim());
  return { errName: e.name, errMsg: e.message, stack: frames };
}
// ── Process-level resilience ──────────────────────────────────────────────────
// Log-and-continue handlers for errors that escape every try/catch.
// NOTE(review): with an "uncaughtException" listener installed the process keeps
// running after the error; confirm that this is the intended resilience policy.
process.on("uncaughtException", (err) => {
  logger.error("Uncaught exception", errCtx(err));
});
process.on("unhandledRejection", (reason) => {
  logger.error("Unhandled promise rejection", errCtx(reason));
});
// ── Singleton browser ─────────────────────────────────────────────────────────
/** The shared Chromium instance, or null when none is connected. */
let _browser: Browser | null = null;
/** In-flight launch shared by concurrent callers; null when no launch is pending. */
let _browserLaunch: Promise<Browser> | null = null;

/**
 * Return the shared Chromium instance, launching it on first use or after a disconnect.
 *
 * Concurrent callers that arrive while a launch is in progress all await the
 * same launch promise, so only one Chromium process is ever started at a time.
 *
 * @returns A connected Browser.
 * @throws Whatever `chromium.launch` rejects with; the next call retries the launch.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) return _browser;
  if (_browserLaunch) return _browserLaunch;

  const launch = (async (): Promise<Browser> => {
    const t = Date.now();
    logger.info("Launching Chromium");
    const browser = await chromium.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    browser.on("disconnected", () => {
      logger.warn("Browser disconnected — will relaunch on next request");
      // Only clear the singleton if it still points at THIS browser; a stale
      // handler must not wipe out a newer instance launched in the meantime.
      if (_browser === browser) _browser = null;
    });
    _browser = browser;
    logger.info("Chromium ready", { ms: elapsed(t) });
    return browser;
  })();

  _browserLaunch = launch;
  try {
    return await launch;
  } finally {
    // Clear the pending marker on success and on failure so a failed launch can be retried.
    if (_browserLaunch === launch) _browserLaunch = null;
  }
}
// ── Session store ─────────────────────────────────────────────────────────────
/** One captcha image fetched within a session. */
type CaptchaEntry = {
// Identifier generated by makeId() when the image is fetched; returned to the caller.
captchaId: string;
// Base64 image payload without the "data:...," prefix (the part after the comma of the data URL).
b64: string;
// Date.now() timestamp (ms) taken when the entry was stored.
fetchedAt: number;
};
/**
 * A browser context + page kept alive for one GST lookup.
 * Multiple captcha images can be fetched into `captchas` without reloading
 * the page — useful when the first image is unreadable or the user types it
 * wrong and wants to retry without paying the cost of a full page reload.
 */
type Session = {
// Key of this session in the `sessions` map; generated by makeId().
sessionId: string;
// Browser context owned by the session; closed when the session is closed, pruned or found expired.
ctx: BrowserContext;
// Page that loaded the GST search page; captcha and search requests are issued from it via page.evaluate.
page: Page;
captchas: CaptchaEntry[]; // ordered oldest→newest (new entries are pushed at the end)
// Date.now() timestamp (ms) at creation. Not read anywhere in the visible code.
createdAt: number;
// Date.now() timestamp (ms); refreshed on every successful getSession() lookup.
lastUsedAt: number;
// Date.now()-based deadline (ms): creation time + SESSION_TTL_MS, reset on each captcha refresh.
expires: number;
};
// Live sessions keyed by sessionId.
const sessions = new Map<string, Session>();
/**
 * Generate an unpredictable, URL-safe identifier.
 *
 * Session IDs act as bearer tokens for a live browser session, so they are
 * drawn from the cryptographically secure generator rather than Math.random,
 * whose output is guessable.
 *
 * @returns A random RFC 4122 version 4 UUID string.
 */
function makeId(): string {
  return randomUUID();
}
/**
 * Sweep the session map: close the browser context of every session whose
 * `expires` deadline has passed and drop it from the map. Context close
 * failures are logged as warnings, never thrown.
 */
function pruneExpired(): void {
  const cutoff = Date.now();
  const expired = [...sessions].filter(([, session]) => session.expires < cutoff);

  for (const [id, session] of expired) {
    // Fire-and-forget: the sweep must not wait on (or fail because of) a slow close.
    session.ctx.close().catch((e) =>
      logger.warn("Error closing expired ctx", { sessionId: id, ...errCtx(e) })
    );
    sessions.delete(id);
  }

  const pruned = expired.length;
  if (pruned > 0) {
    logger.info("Pruned expired sessions", { pruned, remaining: sessions.size });
  }
}
/**
 * Fetch a live session by id.
 *
 * An expired session is removed from the map and its context closed
 * (close errors are logged only). A live session gets `lastUsedAt` refreshed.
 *
 * @param sessionId Id previously returned when the session was created.
 * @returns The session, or null when it is unknown or expired.
 */
function getSession(sessionId: string): Session | null {
  const found = sessions.get(sessionId);
  if (found === undefined) {
    return null;
  }

  const isExpired = found.expires < Date.now();
  if (isExpired) {
    sessions.delete(sessionId);
    found.ctx.close().catch((e) =>
      logger.warn("Error closing expired ctx on access", { sessionId, ...errCtx(e) })
    );
    logger.info("Session expired on access", { sessionId });
    return null;
  }

  found.lastUsedAt = Date.now();
  return found;
}
/**
 * Remove a session from the map and close its browser context.
 * Unknown ids are ignored; close errors are logged as warnings.
 *
 * @param sessionId Id of the session to close.
 * @param reason    Free-form label recorded in the "Session closed" log entry.
 */
function closeSession(sessionId: string, reason = "normal"): void {
  const target = sessions.get(sessionId);
  if (target === undefined) {
    return;
  }

  sessions.delete(sessionId);
  logger.info("Session closed", { sessionId, reason, captchaCount: target.captchas.length });
  // Not awaited: callers respond immediately while the context shuts down in the background.
  target.ctx.close().catch((e) =>
    logger.warn("Error closing ctx", { sessionId, ...errCtx(e) })
  );
}
// ── Playwright page helpers ───────────────────────────────────────────────────
/**
 * Forward selected Playwright page events to the structured logger, tagging
 * every entry with `sessionId`.
 *
 * - console errors/warnings → debug
 * - uncaught page JS errors → warn
 * - failed network requests → warn
 * - HTTP >= 400 on captcha/search URLs → warn
 *
 * @param page      Page to observe.
 * @param sessionId Id added to every log entry.
 */
function attachPageListeners(page: Page, sessionId: string): void {
  // URL fragments identifying the portal endpoints whose HTTP errors are worth a warning.
  const keyEndpointMarkers = ["captcha", "searchtp", "/api/search"];

  page.on("console", (message) => {
    const type = message.type();
    if (type !== "error" && type !== "warning") return;
    logger.debug("Page console", { sessionId, type, text: message.text() });
  });

  page.on("pageerror", (pageErr) => {
    logger.warn("Uncaught JS error on page", { sessionId, ...errCtx(pageErr) });
  });

  page.on("requestfailed", (request) => {
    const failure = request.failure()?.errorText;
    logger.warn("Network request failed", { sessionId, url: request.url(), failure });
  });

  page.on("response", (response) => {
    const status = response.status();
    if (status < 400) return;
    const url = response.url();
    if (!keyEndpointMarkers.some((marker) => url.includes(marker))) return;
    logger.warn("GST portal returned HTTP error on key endpoint", { sessionId, url, status });
  });
}
/**
 * Request /services/captcha from inside the page and return the image as base64.
 *
 * The fetch runs in the page itself (via page.evaluate), so it is issued with
 * that page's origin and cookies.
 *
 * @param page      Page already loaded on the GST portal.
 * @param sessionId Id used only for log correlation.
 * @returns Base64 image data without the data-URL prefix.
 * @throws If the endpoint answers with a non-OK status, the blob cannot be read,
 *         or the decoded payload is shorter than 100 characters.
 */
async function fetchCaptchaFromPage(page: Page, sessionId: string): Promise<string> {
  const startedAt = Date.now();
  logger.debug("Fetching captcha image from GST portal", { sessionId });

  // Everything inside this callback executes in the browser, not in Node.
  const b64: string = await page.evaluate(() => {
    const request = fetch("/services/captcha", { headers: { Accept: "image/png,image/*" } });
    return request
      .then((resp) => {
        if (resp.ok) return resp.blob();
        throw new Error(`Captcha endpoint returned HTTP ${resp.status}`);
      })
      .then(
        (imageBlob) =>
          new Promise<string>((resolve, reject) => {
            const reader = new FileReader();
            reader.onerror = () => reject(new Error("FileReader failed reading captcha blob"));
            // A data URL looks like "data:<mime>;base64,<payload>"; keep only the payload.
            reader.onload = () => resolve((reader.result as string).split(",")[1]);
            reader.readAsDataURL(imageBlob);
          })
      );
  });

  // Anything under 100 base64 chars cannot be a real image.
  const b64Len = b64?.length ?? 0;
  if (b64Len < 100) {
    throw new Error(`Captcha response looks invalid (base64 length=${b64Len})`);
  }

  logger.debug("Captcha image ready", { sessionId, b64Len: b64.length, ms: elapsed(startedAt) });
  return b64;
}
// ── Express app ───────────────────────────────────────────────────────────────
const app = express();
app.use(express.json());

// CORS — every response carries these headers so the Pelagia portal can call
// this service from any origin.
const CORS_HEADERS: ReadonlyArray<readonly [string, string]> = [
  ["Access-Control-Allow-Origin", "*"],
  ["Access-Control-Allow-Headers", "Content-Type"],
];
app.use((_req, res, next) => {
  for (const [header, value] of CORS_HEADERS) {
    res.setHeader(header, value);
  }
  next();
});
// ── Per-request ID + response logging ────────────────────────────────────────
/** Express request augmented with a per-request id and its start timestamp (ms). */
type TrackedReq = express.Request & { reqId: string; startMs: number };

// Tag every request, then log one line when the response has been sent.
// The log level follows the status code: 5xx → ERROR, 4xx → WARN, otherwise INFO.
app.use((req, res, next) => {
  const tracked = req as TrackedReq;
  tracked.reqId = makeId();
  tracked.startMs = Date.now();

  res.on("finish", () => {
    let level: LogLevel = "INFO";
    if (res.statusCode >= 500) {
      level = "ERROR";
    } else if (res.statusCode >= 400) {
      level = "WARN";
    }
    log(level, `${req.method} ${req.path}`, {
      reqId: tracked.reqId,
      status: res.statusCode,
      ms: elapsed(tracked.startMs),
      sessions: sessions.size,
    });
  });

  next();
});
// ── GET /health ───────────────────────────────────────────────────────────────
// Liveness + diagnostics: browser connectivity and a summary of every live session.
// NOTE(review): the response includes session ids and is served with the
// wildcard CORS header; confirm this endpoint is not reachable by untrusted callers.
app.get("/health", (_req, res) => {
  const now = Date.now();
  const activeSessions = Array.from(
    sessions.values(),
    ({ sessionId, captchas, expires, lastUsedAt }) => ({
      sessionId,
      captchaCount: captchas.length,
      expiresInMs: expires - now,
      lastUsedMsAgo: now - lastUsedAt,
    })
  );
  const browserConnected = _browser?.isConnected() ?? false;

  res.json({
    ok: true,
    browserConnected,
    sessionCount: sessions.size,
    activeSessions,
  });
});
// ── GET /captcha — create new session ─────────────────────────────────────────
/**
 * Loads the GST search page in a fresh browser context, fetches the first
 * captcha image, and returns a session that can be reused for retries.
 *
 * Expired sessions are pruned at the start of every call.
 *
 * Response: { sessionId, captchaId, captchaBase64 }
 * Failure:  502 { error } — the half-built browser context is closed.
 */
app.get("/captcha", async (req, res) => {
  pruneExpired();
  const { reqId } = req as TrackedReq;
  const t = Date.now();
  // Created before the try block so the failure log can be correlated with the
  // page-listener warnings, which are keyed by sessionId.
  const sessionId = makeId();
  let ctx: BrowserContext | undefined;
  try {
    const browser = await getBrowser();
    logger.debug("Creating browser context", { reqId });
    ctx = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1280, height: 900 },
    });
    const page = await ctx.newPage();
    attachPageListeners(page, sessionId);

    // Navigate — this establishes the portal's session cookies
    logger.info("Navigating to GST search page", { reqId, sessionId });
    const navT = Date.now();
    await page.goto("https://services.gst.gov.in/services/searchtp", {
      waitUntil: "networkidle",
      timeout: 30_000,
    });
    logger.info("GST portal loaded", { reqId, sessionId, ms: elapsed(navT) });

    // First captcha
    const b64 = await fetchCaptchaFromPage(page, sessionId);
    const captchaId = makeId();
    const now = Date.now();
    sessions.set(sessionId, {
      sessionId,
      ctx,
      page,
      captchas: [{ captchaId, b64, fetchedAt: now }],
      createdAt: now,
      lastUsedAt: now,
      expires: now + SESSION_TTL_MS,
    });
    logger.info("Session created", { reqId, sessionId, captchaId, totalMs: elapsed(t) });
    return res.json({ sessionId, captchaId, captchaBase64: b64 });
  } catch (e) {
    logger.error("GET /captcha failed", { reqId, sessionId, totalMs: elapsed(t), ...errCtx(e) });
    // Best-effort cleanup of the context that never became a session; a close
    // failure is logged (as at every other close site) instead of being swallowed.
    ctx?.close().catch((closeErr) =>
      logger.warn("Error closing ctx after failed captcha fetch", {
        reqId,
        sessionId,
        ...errCtx(closeErr),
      })
    );
    return res.status(502).json({ error: "Failed to fetch CAPTCHA from GST portal. Please try again." });
  }
});
// ── GET /captcha/:sessionId — refresh captcha within existing session ──────────
/**
 * Refresh the captcha for an existing session.
 *
 * The image is fetched through the session's existing page — no new context
 * and no page reload. The GST portal's /services/captcha endpoint issues a
 * fresh image (and updates CaptchaCookie) on each call.
 *
 * Intended for use after a SWEB_9034 (wrong captcha) response, so the user can
 * retry without the latency of a new page load. A successful refresh also
 * restarts the session's TTL.
 *
 * Response: { captchaId, captchaBase64, totalCaptchas }
 * 410 when the session is unknown or expired; 502 when the fetch fails
 * (the session is left in place in that case).
 */
app.get("/captcha/:sessionId", async (req, res) => {
  const { sessionId } = req.params;
  const { reqId } = req as unknown as TrackedReq;
  const startedAt = Date.now();

  const session = getSession(sessionId);
  if (session === null) {
    logger.warn("Captcha refresh: session not found or expired", { reqId, sessionId });
    return res.status(410).json({ error: "Session expired — please start a new lookup." });
  }

  try {
    const image = await fetchCaptchaFromPage(session.page, sessionId);
    const entry = { captchaId: makeId(), b64: image, fetchedAt: Date.now() };
    // push() returns the new array length, i.e. the running captcha count.
    const totalCaptchas = session.captchas.push(entry);
    session.expires = Date.now() + SESSION_TTL_MS; // reset TTL on activity

    logger.info("Captcha refreshed", {
      reqId,
      sessionId,
      captchaId: entry.captchaId,
      totalCaptchas,
      ms: elapsed(startedAt),
    });
    return res.json({ captchaId: entry.captchaId, captchaBase64: image, totalCaptchas });
  } catch (e) {
    logger.error("GET /captcha/:sessionId failed", {
      reqId,
      sessionId,
      ms: elapsed(startedAt),
      ...errCtx(e),
    });
    return res.status(502).json({ error: "Failed to refresh CAPTCHA." });
  }
});
// ── POST /search ───────────────────────────────────────────────────────────────
/** True for non-null, non-array objects. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** True for strings with at least one character. */
function isNonEmptyString(value: unknown): value is string {
  return typeof value === "string" && value.length > 0;
}

/** Map the portal's raw taxpayer payload to the shape returned by POST /search. */
function toTaxpayerResult(raw: Record<string, unknown>) {
  // `pradr.adr` is only used as the address when it really is a string, so an
  // unexpected payload shape cannot throw in the pincode regex below.
  const principal = raw.pradr;
  const adr = isPlainRecord(principal) ? principal.adr : undefined;
  const address = typeof adr === "string" ? adr : "";
  // First standalone 6-digit number in the address.
  const pincode = address.match(/\b(\d{6})\b/)?.[1] ?? "";
  // `stj` is split on commas; the first part has a leading "State - " prefix stripped.
  const state = (String(raw.stj ?? "").split(",")[0] ?? "").replace(/^State\s*-\s*/i, "");
  return {
    legalName: raw.lgnm ?? "",
    tradeName: raw.tradeNam ?? raw.lgnm ?? "",
    address,
    state,
    pincode,
    gstin: raw.gstin,
    status: raw.sts ?? "",
    businessType: raw.ctb ?? raw.dty ?? "",
    registrationDate: raw.rgdt ?? "",
  };
}

/**
 * Body: { sessionId, gstin, captcha } — all three must be non-empty strings.
 *
 * Success: closes session, returns taxpayer data.
 * SWEB_9034: keeps session alive, returns { canRefresh: true, sessionId }
 * — caller should GET /captcha/:sessionId for a fresh image.
 * Other error: closes session, returns { error }.
 *
 * Status codes: 400 invalid body, 410 unknown/expired session, 422 portal
 * rejected the lookup, 502 portal/network failure, 500 unexpected exception.
 */
app.post("/search", async (req, res) => {
  const body: Record<string, unknown> = isPlainRecord(req.body) ? req.body : {};
  const { sessionId, gstin, captcha } = body;
  const { reqId } = req as TrackedReq;
  const t = Date.now();

  // Type-check as well as presence-check: the values are forwarded to the portal
  // and `captcha.length` is logged, so numbers/objects must be rejected here.
  if (!isNonEmptyString(sessionId) || !isNonEmptyString(gstin) || !isNonEmptyString(captcha)) {
    return res.status(400).json({ error: "sessionId, gstin and captcha are required" });
  }

  const session = getSession(sessionId);
  if (!session) {
    logger.warn("Search: session not found or expired", { reqId, sessionId, gstin });
    return res.status(410).json({ error: "Session expired — please fetch a new CAPTCHA." });
  }

  logger.info("Submitting GST search", {
    reqId,
    sessionId,
    gstin,
    captchaLen: captcha.length,
    captchaCount: session.captchas.length,
  });

  try {
    const searchT = Date.now();
    // The POST is issued from inside the page; fetch/JSON failures are turned
    // into `{ error }` objects there rather than thrown.
    const payload: unknown = await session.page.evaluate(
      ([g, c]: [string, string]) =>
        fetch("/services/api/search/tp", {
          method: "POST",
          headers: {
            Accept: "application/json, text/plain",
            "Content-Type": "application/json;charset=UTF-8",
          },
          body: JSON.stringify({ gstin: g, captcha: c }),
        })
          .then(async (r) => r.json().catch(() => ({ error: `HTTP ${r.status}` })))
          .catch((e: Error) => ({ error: e.message })),
      [gstin, captcha] as [string, string]
    );
    logger.debug("GST portal raw response", {
      reqId, sessionId, gstin, ms: elapsed(searchT), raw: payload,
    });

    // A JSON body of `null`, an array or a primitive is handled like an empty
    // object, so it ends in the "no data" branch instead of throwing on property access.
    const raw: Record<string, unknown> = isPlainRecord(payload) ? payload : {};

    // Wrong captcha — keep session alive so caller can refresh
    if (raw.errorCode === "SWEB_9034") {
      logger.warn("Wrong captcha — session kept alive for refresh", {
        reqId, sessionId, gstin,
        captchaCount: session.captchas.length,
        ms: elapsed(t),
      });
      return res.status(422).json({
        error: "Wrong CAPTCHA — please try again.",
        canRefresh: true,
        sessionId,
      });
    }

    // Portal session/auth expired
    if (raw.errorCode === "SWEB_9000") {
      logger.warn("GST portal session expired (SWEB_9000)", { reqId, sessionId, gstin, ms: elapsed(t) });
      closeSession(sessionId, "SWEB_9000");
      return res.status(502).json({ error: "GST portal session expired. Please start a new lookup." });
    }

    // Other portal error codes
    if (raw.errorCode) {
      logger.warn("GST portal error code", {
        reqId, sessionId, gstin, errorCode: raw.errorCode, ms: elapsed(t),
      });
      closeSession(sessionId, `errorCode:${raw.errorCode}`);
      return res.status(422).json({ error: `GST portal error: ${raw.errorCode}` });
    }

    // Network / fetch error from page.evaluate
    if (raw.error) {
      logger.error("GST search network error", {
        reqId, sessionId, gstin, error: raw.error, ms: elapsed(t),
      });
      closeSession(sessionId, "network-error");
      return res.status(502).json({ error: String(raw.error) });
    }

    // Empty / unexpected response
    if (!raw.gstin) {
      logger.warn("No GSTIN in GST portal response", { reqId, sessionId, gstin, ms: elapsed(t) });
      closeSession(sessionId, "no-data");
      return res.status(422).json({ error: "No taxpayer data found for that GSTIN." });
    }

    // Success
    closeSession(sessionId, "success");
    const result = toTaxpayerResult(raw);
    logger.info("GST search successful", {
      reqId, sessionId, gstin, legalName: result.legalName, totalMs: elapsed(t),
    });
    return res.json(result);
  } catch (e) {
    logger.error("POST /search failed unexpectedly", {
      reqId, sessionId, gstin, ms: elapsed(t), ...errCtx(e),
    });
    closeSession(sessionId, "exception");
    return res.status(500).json({ error: "Internal error during GST lookup." });
  }
});
// ── Start ─────────────────────────────────────────────────────────────────────
/** Logs the effective configuration once the HTTP server is accepting connections. */
function onListening(): void {
  logger.info("GstService listening", {
    port: PORT,
    sessionTtlMs: SESSION_TTL_MS,
    logLevel: LOG_LEVEL,
  });
}
app.listen(PORT, onListening);