pelagia-portal/GstService/src/test-lookup.ts
Hardik f372fae953 feat(gst): replace API-key lookup with Playwright microservice
Problem: GST portal's public taxpayer search (services.gst.gov.in/
searchtp) now requires human CAPTCHA verification but no login.
The BIG-IP WAF blocks direct Node.js HTTP clients via TLS
fingerprinting; Playwright (real Chromium) bypasses it successfully.
Confirmed working: GSTIN 27AAHCP5787B1Z6 → full PELAGIA MARINE
SERVICES data including address, jurisdiction, filing status.

GstService/ (new standalone microservice):
- src/index.ts: Express + Playwright singleton browser
  GET  /health  → { ok: true }
  GET  /captcha → launches browser, loads GST portal, fetches
                  CAPTCHA image from same origin (sets CaptchaCookie),
                  stores BrowserContext in session map (3 min TTL)
                  → { sessionId, captchaBase64 }
  POST /search  → { sessionId, gstin, captcha } → submits form
                  via page.evaluate fetch() using live browser session,
                  closes context, returns parsed taxpayer data
- package.json, tsconfig.json, npm install
- src/test-lookup.ts: interactive CLI test (prompted user for captcha)

App changes:
- Remove playwright dep from Next.js app (was incorrectly added)
- Remove lib/gst-lookup.ts (sandbox.co.in placeholder — unused)
- Remove lib/gst-browser.ts (Playwright singleton — moved to service)
- app/api/gst/captcha/route.ts: thin proxy → GST_SERVICE_URL/captcha
- app/api/gst/route.ts: thin proxy POST → GST_SERVICE_URL/search
- vendor-form.tsx: two-step captcha UI
    Step 1: "Look up" → calls /api/gst/captcha → shows PNG inline
    Step 2: user types 6 digits → "Verify" → calls /api/gst → fills
            form (name, address, lat/lng from Nominatim geocoding)
    Wrong captcha → SWEB_9034 error with retry option
- .env.example: GST_SERVICE_URL=http://localhost:3002

Start the microservice: cd GstService && npm run dev

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 13:27:15 +05:30

108 lines
4.5 KiB
TypeScript

/**
* Interactive GST taxpayer lookup via services.gst.gov.in/services/searchtp
* Run (from the GstService directory): npx tsx src/test-lookup.ts <GSTIN>
*
* Flow:
* 1. Load the search page (establishes session)
* 2. Fetch the CAPTCHA image → save to scripts/captcha.png
* 3. Prompt you to open the image and type the 6 digits
* 4. Submit GSTIN + captcha → print the result
*/
import { chromium } from "playwright";
import * as fs from "fs";
import * as readline from "readline";
// GSTIN to look up: first CLI argument, trimmed and upper-cased ("" when omitted).
const GSTIN = (process.argv[2] ?? "").trim().toUpperCase();
if (GSTIN === "") {
  // Name the script that is actually running (argv[1]) instead of a hard-coded
  // path, so the hint stays correct wherever this file lives.
  console.error(`Usage: npx tsx ${process.argv[1] ?? "<script>"} <GSTIN>`);
  process.exit(1);
}
/**
 * Show a prompt on the terminal and wait for one line of input.
 *
 * @param q Text printed before the cursor.
 * @returns The line the user typed, with surrounding whitespace removed.
 */
function ask(q: string): Promise<string> {
  const terminal = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise((done) => {
    terminal.question(q, (reply) => {
      // Close the interface before resolving, as soon as the answer arrives.
      terminal.close();
      done(reply.trim());
    });
  });
}
/**
 * Script entry point: runs one interactive taxpayer lookup end to end.
 *
 * Launches headless Chromium, loads the GST search page, downloads the CAPTCHA
 * through the page's own fetch(), asks the operator to read it, then posts
 * GSTIN + CAPTCHA from inside the page and prints the portal's reply.
 *
 * The browser is closed in a `finally` block so that a failed navigation,
 * fetch or file write does not leave it open. Unexpected failures reach the
 * trailing catch, which prints the message and exits with status 1.
 */
(async () => {
  console.log(`\nGSTIN: ${GSTIN}\n`);

  const browser = await chromium.launch({ headless: true });
  try {
    const ctx = await browser.newContext({
      userAgent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      viewport: { width: 1280, height: 900 },
    });
    const page = await ctx.newPage();

    // ── Step 1: load search page ────────────────────────────────────────────
    process.stdout.write("Loading GST portal… ");
    await page.goto("https://services.gst.gov.in/services/searchtp", {
      waitUntil: "networkidle",
      timeout: 30000,
    });
    console.log("done.");

    // ── Step 2: fetch captcha from same origin (sets CaptchaCookie) ─────────
    process.stdout.write("Fetching CAPTCHA… ");
    const captchaB64: string = await page.evaluate(async () => {
      const r = await fetch("/services/captcha", { headers: { Accept: "image/png,image/*" } });
      // Fail here rather than saving an error page as if it were a PNG.
      if (!r.ok) throw new Error(`CAPTCHA request failed with HTTP ${r.status}`);
      const blob = await r.blob();
      return new Promise<string>((res, rej) => {
        const reader = new FileReader();
        // readAsDataURL yields "data:<mime>;base64,<payload>"; keep the payload only.
        reader.onload = () => res((reader.result as string).split(",")[1] ?? "");
        reader.onerror = () => rej(reader.error);
        reader.readAsDataURL(blob);
      });
    });
    if (!captchaB64) throw new Error("CAPTCHA response contained no image data.");

    const imgDir = "scripts";
    const imgPath = `${imgDir}/captcha.png`;
    // The path is relative to the current directory; create the folder if missing
    // so the write does not fail with ENOENT.
    fs.mkdirSync(imgDir, { recursive: true });
    fs.writeFileSync(imgPath, Buffer.from(captchaB64, "base64"));
    console.log(`saved → ${imgPath}`);

    const cookies = await ctx.cookies("https://services.gst.gov.in");
    const capCookie = cookies.find(c => c.name === "CaptchaCookie");
    console.log(`CaptchaCookie: ${capCookie?.value ?? "NOT SET"}`);

    // ── Step 3: ask for captcha answer ──────────────────────────────────────
    console.log(`\nOpen ${imgPath} and read the 6-digit number.`);
    const captcha = await ask("Enter CAPTCHA (6 digits): ");
    if (!/^\d{6}$/.test(captcha)) {
      console.error("Expected exactly 6 digits. Got:", captcha);
      // Set the exit code and return so the `finally` below still closes the
      // browser; process.exit() here would skip it.
      process.exitCode = 1;
      return;
    }

    // ── Step 4: submit search ───────────────────────────────────────────────
    process.stdout.write(`\nSubmitting { gstin: "${GSTIN}", captcha: "${captcha}" }… `);
    const result: { status: number; body: unknown } = await page.evaluate(
      async ([gstin, cap]: [string, string]) => {
        try {
          const r = await fetch("/services/api/search/tp", {
            method: "POST",
            headers: {
              "Accept": "application/json, text/plain",
              "Content-Type": "application/json;charset=UTF-8",
            },
            body: JSON.stringify({ gstin, captcha: cap }),
          });
          // A response body can be read only once: read it as text, then try to
          // parse, so a non-JSON reply is returned as-is with its real status.
          const text = await r.text();
          let body: unknown;
          try {
            body = JSON.parse(text);
          } catch {
            body = text;
          }
          return { status: r.status, body };
        } catch (e) {
          // status 0 marks "no HTTP response at all" (fetch itself failed).
          return { status: 0, body: { error: e instanceof Error ? e.message : String(e) } };
        }
      },
      [GSTIN, captcha] as [string, string]
    );
    console.log("done.\n");

    console.log("=== Response ===");
    console.log(JSON.stringify(result.body, null, 2));

    // If wrong captcha, SWEB_9034; if GSTIN not found, different code; on success → data
    const { status, body } = result;
    const isObjectBody = typeof body === "object" && body !== null;
    const errorCode = isObjectBody ? (body as Record<string, unknown>).errorCode : undefined;
    const httpOk = status >= 200 && status < 300;

    if (status === 0) {
      console.log("\n→ Request failed inside the browser — no HTTP response was received.");
    } else if (errorCode === "SWEB_9034") {
      console.log("\n→ Wrong CAPTCHA. Re-run to get a fresh image.");
    } else if (errorCode === "SWEB_9000") {
      console.log("\n→ SWEB_9000 (session/auth issue — not a captcha problem).");
    } else if (errorCode) {
      console.log(`\n→ Error code: ${String(errorCode)}`);
    } else if (!httpOk || !isObjectBody) {
      // No error code, but also not a 2xx JSON object: do not report success.
      console.log(`\n→ HTTP ${status} without an error code — response is not taxpayer data.`);
    } else {
      console.log("\n✅ Success — taxpayer data above.");
    }
  } finally {
    await browser.close();
  }
})().catch((e: unknown) => {
  console.error("\nError:", e instanceof Error ? e.message : e);
  process.exit(1);
});