501 lines
18 KiB
TypeScript
501 lines
18 KiB
TypeScript
import { randomBytes } from "node:crypto";

import express from "express";
import { chromium, type Browser, type BrowserContext, type Page } from "playwright";
|
|
|
|
// ── Config ────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Read a numeric setting from the environment.
 *
 * Returns `fallback` when the variable is unset, blank, or does not parse to a
 * finite number. A NaN here would otherwise flow into session expiry
 * timestamps, where every `expires < now` comparison is false and sessions
 * would never be pruned.
 *
 * @param name     Environment variable name.
 * @param fallback Value used when the variable is missing or unusable.
 */
function envNumber(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === "") return fallback;

  const parsed = Number(raw);
  if (Number.isFinite(parsed)) return parsed;

  // The structured logger is not initialised yet at this point, so write directly.
  process.stderr.write(`Ignoring non-numeric ${name}=${JSON.stringify(raw)}; using ${fallback}\n`);
  return fallback;
}

/**
 * Normalise a log-level string (case-insensitive) to a known level.
 *
 * Unset or blank input yields "INFO"; an unrecognised value also yields "INFO"
 * after a warning on stderr. Without this check an unknown level makes the
 * threshold lookup undefined and every message is emitted.
 *
 * @param raw Raw value, typically `process.env.LOG_LEVEL`.
 */
function parseLogLevel(raw: string | undefined): LogLevel {
  const candidate = raw?.trim().toUpperCase();
  if (!candidate) return "INFO";

  switch (candidate) {
    case "DEBUG":
    case "INFO":
    case "WARN":
    case "ERROR":
      return candidate;
    default:
      process.stderr.write(`Ignoring unknown LOG_LEVEL=${JSON.stringify(raw)}; using INFO\n`);
      return "INFO";
  }
}

const PORT = envNumber("PORT", 3003);
const SESSION_TTL_MS = envNumber("SESSION_TTL_MS", 3 * 60 * 1000); // 3 min default
const LOG_LEVEL = parseLogLevel(process.env.LOG_LEVEL);
|
|
|
|
// ── Structured logger ─────────────────────────────────────────────────────────
|
|
|
|
/** Severity labels accepted by the logger. */
type LogLevel = "DEBUG" | "INFO" | "WARN" | "ERROR";

/** Numeric rank per level; a message is emitted when its rank is not below the configured level's rank. */
const LEVEL_RANK: Record<LogLevel, number> = {
  DEBUG: 0,
  INFO: 1,
  WARN: 2,
  ERROR: 3,
};
|
|
|
|
/**
 * Emit one JSON log line if `level` meets the configured threshold.
 *
 * @param level Severity of this message.
 * @param msg   Human-readable message.
 * @param ctx   Optional extra fields, spread into the top level of the entry.
 */
function log(level: LogLevel, msg: string, ctx?: Record<string, unknown>): void {
  const belowThreshold = LEVEL_RANK[level] < LEVEL_RANK[LOG_LEVEL];
  if (belowThreshold) return;

  const payload = JSON.stringify({ ts: new Date().toISOString(), level, msg, ...ctx });

  // Errors and warnings go to stderr so process supervisors can separate them
  const sink = level === "ERROR" || level === "WARN" ? process.stderr : process.stdout;
  sink.write(payload + "\n");
}
|
|
|
|
/** Per-level convenience wrappers around `log`. */
const logger = {
  debug(msg: string, ctx?: Record<string, unknown>): void {
    return log("DEBUG", msg, ctx);
  },
  info(msg: string, ctx?: Record<string, unknown>): void {
    return log("INFO", msg, ctx);
  },
  warn(msg: string, ctx?: Record<string, unknown>): void {
    return log("WARN", msg, ctx);
  },
  error(msg: string, ctx?: Record<string, unknown>): void {
    return log("ERROR", msg, ctx);
  },
};
|
|
|
|
/** Milliseconds between `startMs` (a `Date.now()` timestamp) and now. */
function elapsed(startMs: number): number {
  const nowMs = Date.now();
  return nowMs - startMs;
}
|
|
|
|
/**
 * Turn any thrown value into fields suitable for a log entry.
 *
 * `Error` instances yield `errName`, `errMsg` and a trimmed `stack` array;
 * anything else is stringified under `err`.
 */
function errCtx(e: unknown): Record<string, unknown> {
  if (!(e instanceof Error)) {
    return { err: String(e) };
  }

  // First 6 lines of the stack — enough for diagnosis without flooding logs
  const frames = e.stack
    ?.split("\n")
    .slice(0, 6)
    .map((frame) => frame.trim());

  return { errName: e.name, errMsg: e.message, stack: frames };
}
|
|
|
|
// ── Process-level resilience ──────────────────────────────────────────────────
|
|
|
|
// Log otherwise-fatal errors through the structured logger; neither handler exits the process.
process.on("uncaughtException", (err) => {
  logger.error("Uncaught exception", errCtx(err));
});

process.on("unhandledRejection", (reason) => {
  logger.error("Unhandled promise rejection", errCtx(reason));
});
|
|
|
|
// ── Singleton browser ─────────────────────────────────────────────────────────
|
|
|
|
/** The shared Chromium instance, or null when none is currently running. */
let _browser: Browser | null = null;

/** Launch currently in progress, shared so concurrent callers do not each spawn a browser. */
let _browserLaunch: Promise<Browser> | null = null;

/**
 * Return the shared Chromium instance, launching it if necessary.
 *
 * - A connected browser is returned as-is.
 * - If a launch is already in flight, that same promise is returned, so a
 *   burst of requests results in one Chromium process rather than several
 *   (with all but the last leaked).
 * - The "disconnected" handler only clears `_browser` when it still refers to
 *   the browser that disconnected, so a late event from an old instance cannot
 *   discard a newer one.
 *
 * @returns The connected browser.
 * @throws Whatever `chromium.launch` rejects with; the next call retries.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) return _browser;
  if (_browserLaunch) return _browserLaunch;

  const t = Date.now();
  logger.info("Launching Chromium");

  const launch = (async (): Promise<Browser> => {
    const browser = await chromium.launch({
      headless: true,
      args: ["--no-sandbox", "--disable-setuid-sandbox"],
    });
    _browser = browser;
    browser.on("disconnected", () => {
      logger.warn("Browser disconnected — will relaunch on next request");
      if (_browser === browser) _browser = null;
    });
    logger.info("Chromium ready", { ms: elapsed(t) });
    return browser;
  })();
  _browserLaunch = launch;

  try {
    return await launch;
  } finally {
    // Clear the in-flight marker on success and on failure so a failed launch can be retried.
    if (_browserLaunch === launch) _browserLaunch = null;
  }
}
|
|
|
|
// ── Session store ─────────────────────────────────────────────────────────────
|
|
|
|
/** A single captcha image fetched during a session. */
interface CaptchaEntry {
  /** Identifier generated by this service for the image. */
  captchaId: string;
  /** Image bytes, base64-encoded. */
  b64: string;
  /** `Date.now()` timestamp taken when the image was stored. */
  fetchedAt: number;
}
|
|
|
|
/**
 * State for one GST lookup: a browser context and its page, kept open so
 * further captcha images can be fetched into `captchas` without reloading the
 * page (e.g. when the first image is unreadable or was typed wrong).
 */
interface Session {
  sessionId: string;
  ctx: BrowserContext;
  page: Page;
  /** Captchas fetched so far, ordered oldest→newest. */
  captchas: CaptchaEntry[];
  /** `Date.now()` at creation. */
  createdAt: number;
  /** `Date.now()` of the most recent successful lookup of this session. */
  lastUsedAt: number;
  /** `Date.now()`-based deadline after which the session is treated as expired. */
  expires: number;
}

/** Live sessions keyed by session ID. */
const sessions: Map<string, Session> = new Map();
|
|
|
|
/**
 * Generate an unguessable identifier.
 *
 * The result is used as the session ID, which is the only credential a caller
 * presents to reuse a browser session, so it is drawn from the OS CSPRNG
 * rather than `Math.random()` plus a timestamp.
 *
 * @returns 32 lowercase hex characters (128 random bits).
 */
function makeId(): string {
  return randomBytes(16).toString("hex");
}
|
|
|
|
/**
 * Close and forget every session whose `expires` deadline has passed.
 * Context close failures are logged as warnings and do not stop the sweep.
 */
function pruneExpired(): void {
  const cutoff = Date.now();
  const expired = [...sessions].filter(([, session]) => session.expires < cutoff);
  if (expired.length === 0) return;

  for (const [id, session] of expired) {
    session.ctx.close().catch((e) =>
      logger.warn("Error closing expired ctx", { sessionId: id, ...errCtx(e) })
    );
    sessions.delete(id);
  }

  logger.info("Pruned expired sessions", { pruned: expired.length, remaining: sessions.size });
}
|
|
|
|
/**
 * Fetch a live session by ID and stamp its `lastUsedAt`.
 *
 * @returns The session, or null when the ID is unknown or the session has
 *          expired (an expired session is removed and its context closed).
 */
function getSession(sessionId: string): Session | null {
  const found = sessions.get(sessionId);
  if (!found) return null;

  const isExpired = found.expires < Date.now();
  if (!isExpired) {
    found.lastUsedAt = Date.now();
    return found;
  }

  sessions.delete(sessionId);
  found.ctx.close().catch((e) =>
    logger.warn("Error closing expired ctx on access", { sessionId, ...errCtx(e) })
  );
  logger.info("Session expired on access", { sessionId });
  return null;
}
|
|
|
|
/**
 * Remove a session from the store and close its browser context.
 * Does nothing when the ID is unknown.
 *
 * @param sessionId Session to close.
 * @param reason    Label recorded in the "Session closed" log entry.
 */
function closeSession(sessionId: string, reason = "normal"): void {
  const target = sessions.get(sessionId);
  if (!target) return;

  sessions.delete(sessionId);
  logger.info("Session closed", { sessionId, reason, captchaCount: target.captchas.length });

  // Closing is fire-and-forget; a failure is only logged.
  const onCloseError = (e: unknown): void =>
    logger.warn("Error closing ctx", { sessionId, ...errCtx(e) });
  target.ctx.close().catch(onCloseError);
}
|
|
|
|
// ── Playwright page helpers ───────────────────────────────────────────────────
|
|
|
|
/**
 * Subscribe to page events so Playwright activity shows up in structured logs.
 *
 * - console errors/warnings → debug
 * - uncaught page errors and failed requests → warn
 * - HTTP status >= 400 on URLs containing "captcha", "searchtp" or "/api/search" → warn
 */
function attachPageListeners(page: Page, sessionId: string): void {
  page.on("console", (msg) => {
    const type = msg.type();
    const worthLogging = type === "error" || type === "warning";
    if (!worthLogging) return;
    logger.debug("Page console", { sessionId, type, text: msg.text() });
  });

  page.on("pageerror", (err) => {
    logger.warn("Uncaught JS error on page", { sessionId, ...errCtx(err) });
  });

  page.on("requestfailed", (req) => {
    const failure = req.failure()?.errorText;
    logger.warn("Network request failed", { sessionId, url: req.url(), failure });
  });

  page.on("response", (resp) => {
    const url = resp.url();
    const status = resp.status();
    if (status < 400) return;

    const isKeyEndpoint = ["captcha", "searchtp", "/api/search"].some((fragment) =>
      url.includes(fragment)
    );
    if (isKeyEndpoint) {
      logger.warn("GST portal returned HTTP error on key endpoint", { sessionId, url, status });
    }
  });
}
|
|
|
|
/**
 * Request /services/captcha from inside the page and return the image as base64.
 *
 * @param page      Page in which the fetch is evaluated.
 * @param sessionId Used only for log correlation.
 * @returns Base64 payload of the image (the part after the comma of the data URL).
 * @throws If the endpoint answers with a non-OK status, the blob cannot be
 *         read, or the resulting base64 is missing or shorter than 100 chars.
 */
async function fetchCaptchaFromPage(page: Page, sessionId: string): Promise<string> {
  const startedAt = Date.now();
  logger.debug("Fetching captcha image from GST portal", { sessionId });

  // The callback is evaluated in the page, so it must stay self-contained.
  const encoded: string = await page.evaluate(() =>
    fetch("/services/captcha", { headers: { Accept: "image/png,image/*" } })
      .then((resp) => {
        if (resp.ok) return resp.blob();
        throw new Error(`Captcha endpoint returned HTTP ${resp.status}`);
      })
      .then(
        (imageBlob) =>
          new Promise<string>((resolve, reject) => {
            const fr = new FileReader();
            fr.onerror = () => reject(new Error("FileReader failed reading captcha blob"));
            // Data URL is "data:<mime>;base64,<payload>" — keep only the payload
            fr.onload = () => resolve((fr.result as string).split(",")[1]);
            fr.readAsDataURL(imageBlob);
          })
      )
  );

  const looksValid = Boolean(encoded) && encoded.length >= 100;
  if (!looksValid) {
    throw new Error(`Captcha response looks invalid (base64 length=${encoded?.length ?? 0})`);
  }

  logger.debug("Captcha image ready", { sessionId, b64Len: encoded.length, ms: elapsed(startedAt) });
  return encoded;
}
|
|
|
|
// ── Express app ───────────────────────────────────────────────────────────────
|
|
|
|
/** The Express application; request bodies are parsed as JSON. */
const app = express();
app.use(express.json());

// CORS — allow Pelagia portal to call us from any origin
app.use((_req, res, next) => {
  const corsHeaders: ReadonlyArray<[string, string]> = [
    ["Access-Control-Allow-Origin", "*"],
    ["Access-Control-Allow-Headers", "Content-Type"],
  ];
  for (const [name, value] of corsHeaders) {
    res.setHeader(name, value);
  }
  next();
});
|
|
|
|
// ── Per-request ID + response logging ────────────────────────────────────────
|
|
/** An Express request annotated with a per-request ID and its start time. */
type TrackedReq = express.Request & { reqId: string; startMs: number };

// Tag every request, then log method, path, status and duration once the response finishes.
app.use((req, res, next) => {
  const tracked = Object.assign(req as TrackedReq, {
    reqId: makeId(),
    startMs: Date.now(),
  });

  res.on("finish", () => {
    // 5xx → ERROR, 4xx → WARN, everything else → INFO
    let level: LogLevel = "INFO";
    if (res.statusCode >= 500) level = "ERROR";
    else if (res.statusCode >= 400) level = "WARN";

    log(level, `${req.method} ${req.path}`, {
      reqId: tracked.reqId,
      status: res.statusCode,
      ms: elapsed(tracked.startMs),
      sessions: sessions.size,
    });
  });

  next();
});
|
|
|
|
// ── GET /health ───────────────────────────────────────────────────────────────
|
|
/**
 * Liveness/diagnostics endpoint.
 *
 * Response: { ok, browserConnected, sessionCount, activeSessions[] } where each
 * active session reports its captcha count, remaining TTL and idle time.
 *
 * Session IDs are the only credential needed to use a session via
 * /captcha/:sessionId and /search, and this endpoint is unauthenticated and
 * served with `Access-Control-Allow-Origin: *`. Only a short prefix of each ID
 * is therefore returned — enough to correlate with log entries, not enough to
 * reuse the session.
 */
app.get("/health", (_req, res) => {
  const now = Date.now();
  res.json({
    ok: true,
    browserConnected: _browser?.isConnected() ?? false,
    sessionCount: sessions.size,
    activeSessions: [...sessions.values()].map((s) => ({
      sessionId: `${s.sessionId.slice(0, 4)}…`,
      captchaCount: s.captchas.length,
      expiresInMs: s.expires - now,
      lastUsedMsAgo: now - s.lastUsedAt,
    })),
  });
});
|
|
|
|
// ── GET /captcha — create new session ─────────────────────────────────────────
|
|
/**
 * Loads the GST search page in a fresh browser context, fetches the first
 * captcha image, and returns a session that can be reused for retries.
 *
 * Expired sessions are pruned before the new one is created.
 *
 * Response: { sessionId, captchaId, captchaBase64 }
 * On failure: 502 with { error }; any context created for the attempt is
 * closed, and a failure to close it is logged rather than silently dropped.
 */
app.get("/captcha", async (req, res) => {
  pruneExpired();
  const { reqId } = req as TrackedReq;
  const t = Date.now();
  // Declared outside the try so the catch block can clean up a half-built context.
  let ctx: BrowserContext | undefined;

  try {
    const browser = await getBrowser();
    logger.debug("Creating browser context", { reqId });

    ctx = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1280, height: 900 },
    });
    const page = await ctx.newPage();
    const sessionId = makeId();
    attachPageListeners(page, sessionId);

    // Navigate — this establishes the portal's session cookies
    logger.info("Navigating to GST search page", { reqId, sessionId });
    const navT = Date.now();
    await page.goto("https://services.gst.gov.in/services/searchtp", {
      waitUntil: "networkidle",
      timeout: 30_000,
    });
    logger.info("GST portal loaded", { reqId, sessionId, ms: elapsed(navT) });

    // First captcha
    const b64 = await fetchCaptchaFromPage(page, sessionId);
    const captchaId = makeId();

    // The session is only registered once the page and first captcha are ready.
    const now = Date.now();
    sessions.set(sessionId, {
      sessionId,
      ctx,
      page,
      captchas: [{ captchaId, b64, fetchedAt: now }],
      createdAt: now,
      lastUsedAt: now,
      expires: now + SESSION_TTL_MS,
    });

    logger.info("Session created", { reqId, sessionId, captchaId, totalMs: elapsed(t) });
    return res.json({ sessionId, captchaId, captchaBase64: b64 });
  } catch (e) {
    logger.error("GET /captcha failed", { reqId, totalMs: elapsed(t), ...errCtx(e) });
    // Best-effort cleanup: the context was never stored in `sessions`, so nothing else will close it.
    ctx?.close().catch((closeErr) =>
      logger.warn("Error closing ctx after failed captcha fetch", { reqId, ...errCtx(closeErr) })
    );
    return res.status(502).json({ error: "Failed to fetch CAPTCHA from GST portal. Please try again." });
  }
});
|
|
|
|
// ── GET /captcha/:sessionId — refresh captcha within existing session ──────────
|
|
/**
 * Issues another captcha image for an existing session, reusing its browser
 * context and page — the page is not reloaded. Per the original notes, the GST
 * portal's /services/captcha endpoint issues a fresh image (and updates
 * CaptchaCookie) on each call.
 *
 * Intended for use after a SWEB_9034 (wrong captcha) response so the user can
 * retry without the latency of a new page load. A successful refresh also
 * resets the session's TTL.
 *
 * Response: { captchaId, captchaBase64, totalCaptchas }
 * 410 when the session is unknown or expired; 502 when the fetch fails.
 */
app.get("/captcha/:sessionId", async (req, res) => {
  const sessionId = req.params.sessionId;
  const reqId = (req as unknown as TrackedReq).reqId;
  const startedAt = Date.now();

  const live = getSession(sessionId);
  if (live === null) {
    logger.warn("Captcha refresh: session not found or expired", { reqId, sessionId });
    return res.status(410).json({ error: "Session expired — please start a new lookup." });
  }

  try {
    const image = await fetchCaptchaFromPage(live.page, sessionId);
    const captchaId = makeId();

    // push() returns the new length, i.e. the running captcha count
    const totalCaptchas = live.captchas.push({ captchaId, b64: image, fetchedAt: Date.now() });
    live.expires = Date.now() + SESSION_TTL_MS; // reset TTL on activity

    logger.info("Captcha refreshed", {
      reqId,
      sessionId,
      captchaId,
      totalCaptchas,
      ms: elapsed(startedAt),
    });
    return res.json({ captchaId, captchaBase64: image, totalCaptchas });
  } catch (e) {
    logger.error("GET /captcha/:sessionId failed", {
      reqId,
      sessionId,
      ms: elapsed(startedAt),
      ...errCtx(e),
    });
    return res.status(502).json({ error: "Failed to refresh CAPTCHA." });
  }
});
|
|
|
|
// ── POST /search ───────────────────────────────────────────────────────────────
|
|
/** True for plain, non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** True for strings of length >= 1. */
function isNonEmptyString(value: unknown): value is string {
  return typeof value === "string" && value.length > 0;
}

/**
 * Map the GST portal's raw taxpayer payload to this service's response shape.
 *
 * Reads `lgnm`, `tradeNam`, `pradr.adr`, `stj`, `gstin`, `sts`, `ctb`/`dty`
 * and `rgdt`. The address is used only when `pradr.adr` is actually a string,
 * so an unexpected shape yields empty address/pincode instead of throwing.
 * The pincode is the first standalone 6-digit run in the address; the state is
 * the first comma-separated part of `stj` with a leading "State - " removed.
 */
function toTaxpayerResult(raw: Record<string, unknown>): Record<string, unknown> {
  const adr = isRecord(raw.pradr) ? raw.pradr.adr : undefined;
  const address = typeof adr === "string" ? adr : "";
  const pincode = address.match(/\b(\d{6})\b/)?.[1] ?? "";
  const state = (String(raw.stj ?? "").split(",")[0] ?? "").replace(/^State\s*-\s*/i, "");

  return {
    legalName: raw.lgnm ?? "",
    tradeName: raw.tradeNam ?? raw.lgnm ?? "",
    address,
    state,
    pincode,
    gstin: raw.gstin,
    status: raw.sts ?? "",
    businessType: raw.ctb ?? raw.dty ?? "",
    registrationDate: raw.rgdt ?? "",
  };
}

/**
 * Body: { sessionId, gstin, captcha } — all three must be non-empty strings,
 * otherwise 400.
 *
 * Success: closes session, returns taxpayer data.
 * SWEB_9034: keeps session alive, returns 422 { canRefresh: true, sessionId }
 *            — caller should GET /captcha/:sessionId for a fresh image.
 * SWEB_9000: closes session, 502.
 * Other errorCode / no gstin in response: closes session, 422 { error }.
 * Fetch error or non-object portal response: closes session, 502 { error }.
 * Unknown/expired session: 410. Unexpected exception: closes session, 500.
 */
app.post("/search", async (req, res) => {
  // req.body is caller-controlled; never assume it is an object of strings.
  const body: Record<string, unknown> = isRecord(req.body) ? req.body : {};
  const { sessionId, gstin, captcha } = body;
  const { reqId } = req as TrackedReq;
  const t = Date.now();

  if (!isNonEmptyString(sessionId) || !isNonEmptyString(gstin) || !isNonEmptyString(captcha)) {
    return res.status(400).json({ error: "sessionId, gstin and captcha are required" });
  }

  const session = getSession(sessionId);
  if (!session) {
    logger.warn("Search: session not found or expired", { reqId, sessionId, gstin });
    return res.status(410).json({ error: "Session expired — please fetch a new CAPTCHA." });
  }

  logger.info("Submitting GST search", {
    reqId,
    sessionId,
    gstin,
    captchaLen: captcha.length,
    captchaCount: session.captchas.length,
  });

  try {
    const searchT = Date.now();
    const args: [string, string] = [gstin, captcha];
    // Evaluated inside the page so the portal's cookies are sent with the request.
    // Fetch and JSON-parse failures are converted to { error } rather than thrown.
    const raw: unknown = await session.page.evaluate(
      ([g, c]: [string, string]) =>
        fetch("/services/api/search/tp", {
          method: "POST",
          headers: {
            Accept: "application/json, text/plain",
            "Content-Type": "application/json;charset=UTF-8",
          },
          body: JSON.stringify({ gstin: g, captcha: c }),
        })
          .then(async (r) => r.json().catch(() => ({ error: `HTTP ${r.status}` })))
          .catch((e: Error) => ({ error: e.message })),
      args
    );
    logger.debug("GST portal raw response", {
      reqId, sessionId, gstin, ms: elapsed(searchT), raw,
    });

    // A JSON body of null, an array or a primitive cannot be inspected field by field.
    if (!isRecord(raw)) {
      logger.error("GST portal returned a non-object response", {
        reqId, sessionId, gstin, ms: elapsed(t),
      });
      closeSession(sessionId, "bad-response");
      return res.status(502).json({ error: "Unexpected response from GST portal." });
    }

    // Wrong captcha — keep session alive so caller can refresh
    if (raw.errorCode === "SWEB_9034") {
      logger.warn("Wrong captcha — session kept alive for refresh", {
        reqId, sessionId, gstin,
        captchaCount: session.captchas.length,
        ms: elapsed(t),
      });
      return res.status(422).json({
        error: "Wrong CAPTCHA — please try again.",
        canRefresh: true,
        sessionId,
      });
    }

    // Portal session/auth expired
    if (raw.errorCode === "SWEB_9000") {
      logger.warn("GST portal session expired (SWEB_9000)", { reqId, sessionId, gstin, ms: elapsed(t) });
      closeSession(sessionId, "SWEB_9000");
      return res.status(502).json({ error: "GST portal session expired. Please start a new lookup." });
    }

    // Other portal error codes
    if (raw.errorCode) {
      logger.warn("GST portal error code", {
        reqId, sessionId, gstin, errorCode: raw.errorCode, ms: elapsed(t),
      });
      closeSession(sessionId, `errorCode:${String(raw.errorCode)}`);
      return res.status(422).json({ error: `GST portal error: ${String(raw.errorCode)}` });
    }

    // Network / fetch error from page.evaluate
    if (raw.error) {
      logger.error("GST search network error", {
        reqId, sessionId, gstin, error: raw.error, ms: elapsed(t),
      });
      closeSession(sessionId, "network-error");
      return res.status(502).json({ error: String(raw.error) });
    }

    // Empty / unexpected response
    if (!raw.gstin) {
      logger.warn("No GSTIN in GST portal response", { reqId, sessionId, gstin, ms: elapsed(t) });
      closeSession(sessionId, "no-data");
      return res.status(422).json({ error: "No taxpayer data found for that GSTIN." });
    }

    // Success
    closeSession(sessionId, "success");
    const result = toTaxpayerResult(raw);

    logger.info("GST search successful", {
      reqId, sessionId, gstin, legalName: result.legalName, totalMs: elapsed(t),
    });
    return res.json(result);
  } catch (e) {
    logger.error("POST /search failed unexpectedly", {
      reqId, sessionId, gstin, ms: elapsed(t), ...errCtx(e),
    });
    // No-op if the session was already closed earlier in the handler.
    closeSession(sessionId, "exception");
    return res.status(500).json({ error: "Internal error during GST lookup." });
  }
});
|
|
|
|
// ── Start ─────────────────────────────────────────────────────────────────────
|
|
|
|
// Start the HTTP server and log the effective configuration once it is listening.
app.listen(PORT, () => {
  const startup = {
    port: PORT,
    sessionTtlMs: SESSION_TTL_MS,
    logLevel: LOG_LEVEL,
  };
  logger.info("GstService listening", startup);
});
|