rabbitmq-publisher.service.ts 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010
  1. // apps/box-app-api/src/rabbitmq/rabbitmq-publisher.service.ts
  2. import {
  3. Injectable,
  4. Logger,
  5. OnModuleDestroy,
  6. OnModuleInit,
  7. } from '@nestjs/common';
  8. import { ConfigService } from '@nestjs/config';
  9. import { Connection, ConfirmChannel } from 'amqplib';
  10. import * as amqp from 'amqplib';
  11. import { UserLoginEventPayload } from '@box/common/events/user-login-event.dto';
  12. import { AdsClickEventPayload } from '@box/common/events/ads-click-event.dto';
  13. import { nowEpochMsBigInt } from '@box/common/time/time.util';
  14. import { RedisService } from '@box/db/redis/redis.service';
  /** Routing key for ad-click stats events (a plain string; defaults to 'stats.ad.click'). */
  type StatsAdClickRoutingKey = string;
  /** Routing key for video-click stats events (a plain string; defaults to 'stats.video.click'). */
  type StatsVideoClickRoutingKey = string;
  /** Routing key for ad-impression stats events (a plain string; defaults to 'stats.ad.impression'). */
  type StatsAdImpressionRoutingKey = string;
  /**
   * Payload accepted by `publishStatsAdClick()`.
   */
  export interface StatsAdClickEventPayload {
    /** Unique id of this message; used as the publisher idempotency key and the fallback-queue key suffix. */
    messageId: string;
    /** Id of the user who clicked. */
    uid: string;
    /** Id of the clicked ad. */
    adId: string;
    /** Ad type / placement label (free-form string here). */
    adType: string;
    /** Click time as a bigint — presumably epoch milliseconds; confirm against the producer. */
    clickedAt: bigint;
    /** Client IP address. */
    ip: string;
    /** Optional channel identifier. */
    channelId?: string;
    /** Optional client device / machine label. */
    machine?: string;
  }
  /**
   * Payload accepted by `publishStatsVideoClick()`.
   */
  export interface StatsVideoClickEventPayload {
    /** Unique id of this message; used as the publisher idempotency key and the fallback-queue key suffix. */
    messageId: string;
    /** Id of the user who clicked. */
    uid: string;
    /** Id of the clicked video. */
    videoId: string;
    /** Click time as a bigint — presumably epoch milliseconds; confirm against the producer. */
    clickedAt: bigint;
    /** Client IP address. */
    ip: string;
  }
  /**
   * Payload accepted by `publishStatsAdImpression()`.
   */
  export interface StatsAdImpressionEventPayload {
    /** Unique id of this message; used as the publisher idempotency key and the fallback-queue key suffix. */
    messageId: string;
    /** Id of the user who saw the ad. */
    uid: string;
    /** Id of the displayed ad. */
    adId: string;
    /** Ad type / placement label (free-form string here). */
    adType: string;
    /** Impression time as a bigint — presumably epoch milliseconds; confirm against the producer. */
    impressionAt: bigint;
    /** Optional duration the ad was visible, in milliseconds. */
    visibleDurationMs?: number;
    /** Client IP address. */
    ip: string;
    /** Optional channel identifier. */
    channelId?: string;
    /** Optional client device / machine label. */
    machine?: string;
  }
  /**
   * States of the publisher's circuit breaker.
   */
  enum CircuitBreakerState {
    /** Normal operation: publishes are attempted. */
    CLOSED = 'CLOSED',
    /** Broker considered failing: publishes are rejected until the timeout elapses. */
    OPEN = 'OPEN',
    /** Probing: publishes are attempted to test whether the broker recovered. */
    HALF_OPEN = 'HALF_OPEN',
  }
  /**
   * Tunables for the circuit breaker.
   */
  interface CircuitBreakerConfig {
    /** Consecutive failed publishes (while CLOSED) that open the circuit. */
    failureThreshold: number;
    /** Successful publishes (while HALF_OPEN) that close the circuit again. */
    successThreshold: number;
    /** Milliseconds to stay OPEN before moving to HALF_OPEN and probing. */
    timeout: number;
  }
  57. /**
  58. * RabbitMQ Publisher Service
  59. *
  60. * Responsible for publishing analytics events to RabbitMQ with comprehensive error handling:
  61. *
  62. * PRIMARY RESPONSIBILITIES:
  63. * 1. Publish stats events (stats.ad.click, stats.video.click, stats.ad.impression)
  64. * 2. Publish user activity events (user.login, ads.click) with simplified resilience
  65. * 3. Maintain connection health with automatic reconnection
  66. *
  67. * RESILIENCE FEATURES:
  68. * - Circuit Breaker: Prevents overwhelming failed RabbitMQ (CLOSED/OPEN/HALF_OPEN states)
  69. * - Retry Logic: up to 3 attempts, waiting 100ms and then 500ms between them (the configured 2000ms delay is never reached)
  70. * - Redis Fallback Queue: 24-hour TTL for stats events during outages
  71. * - Dead Letter Queue (DLQ): Manual inspection of permanently failed messages
  72. * - Publisher Idempotency: 7-day deduplication window using Redis (stats events only)
  73. * - Message TTL: 24-hour expiration for stats events
  74. * - Automatic Reconnection: Self-healing when RabbitMQ recovers
  75. *
  76. * EVENT TIERS:
  77. * - Tier 1 (Full Resilience): stats.* events with fallback/DLQ/idempotency
  78. * - Tier 2 (Partial Resilience): user.login, ads.click with circuit breaker + retries only
  79. *
  80. * OBSERVABILITY:
  81. * - Structured logs with circuit state, retry counts, and routing context
  82. * - Health status endpoint via getCircuitStatus()
  83. * - Clear warnings when events are dropped due to circuit breaker
  84. *
  85. * @see OBSERVABILITY_ENHANCEMENTS.md for monitoring guide
  86. * @see RABBITMQ_FALLBACK_REPLAY.md for recovery procedures
  87. */
  88. @Injectable()
  89. export class RabbitmqPublisherService implements OnModuleInit, OnModuleDestroy {
  90. private readonly logger = new Logger(RabbitmqPublisherService.name);
  91. private connection?: Connection;
  92. private channel?: ConfirmChannel;
  93. private exchange!: string;
  94. private routingKeyLogin!: string;
  95. private routingKeyAdsClick!: string;
  96. private statsExchange!: string;
  97. private routingKeyStatsAdClick!: StatsAdClickRoutingKey;
  98. private routingKeyStatsVideoClick!: StatsVideoClickRoutingKey;
  99. private routingKeyStatsAdImpression!: StatsAdImpressionRoutingKey;
  100. private dlqExchange!: string;
  101. // Circuit breaker state
  102. private circuitState: CircuitBreakerState = CircuitBreakerState.CLOSED;
  103. private failureCount = 0;
  104. private successCount = 0;
  105. private nextAttemptTime = 0;
  106. private readonly circuitConfig: CircuitBreakerConfig = {
  107. failureThreshold: 5, // Open circuit after 5 failures
  108. successThreshold: 2, // Close circuit after 2 successes
  109. timeout: 60000, // Wait 60s before trying again
  110. };
  111. // Reconnection state
  112. private isReconnecting = false;
  113. private reconnectionScheduled = false;
  114. // Retry configuration
  115. private readonly maxRetries = 3;
  116. private readonly retryDelays = [100, 500, 2000]; // Exponential backoff
  117. // Message TTL (24 hours for fallback queue)
  118. private readonly messageTTL = 86400000; // 24 hours in ms
  119. private readonly idempotencyTTL = 604800; // 7 days in seconds
  /**
   * @param config - Configuration source for the `RABBITMQ_*` settings.
   * @param redis - Redis wrapper used for idempotency keys and the fallback queue.
   */
  constructor(
    private readonly config: ConfigService,
    private readonly redis: RedisService,
  ) {}
  /**
   * Nest lifecycle hook: resolves exchange and routing-key settings from
   * configuration (with defaults) and opens the initial RabbitMQ connection.
   *
   * Never throws. When `RABBITMQ_URL` is missing, or the first connection
   * attempt fails, the circuit breaker is put into OPEN instead so publishers
   * take their fallback / drop paths.
   */
  async onModuleInit(): Promise<void> {
    const url = this.config.get<string>('RABBITMQ_URL');

    this.exchange =
      this.config.get<string>('RABBITMQ_LOGIN_EXCHANGE') ?? 'stats.user';
    this.routingKeyLogin =
      this.config.get<string>('RABBITMQ_LOGIN_ROUTING_KEY') ?? 'user.login';
    this.routingKeyAdsClick =
      this.config.get<string>('RABBITMQ_ADS_CLICK_ROUTING_KEY') ?? 'ads.click';

    // The stats exchange falls back to the login exchange when not configured.
    this.statsExchange =
      this.config.get<string>('RABBITMQ_STATS_EXCHANGE') ?? this.exchange;
    this.routingKeyStatsAdClick =
      this.config.get<string>('RABBITMQ_STATS_AD_CLICK_ROUTING_KEY') ??
      'stats.ad.click';
    this.routingKeyStatsVideoClick =
      this.config.get<string>('RABBITMQ_STATS_VIDEO_CLICK_ROUTING_KEY') ??
      'stats.video.click';
    this.routingKeyStatsAdImpression =
      this.config.get<string>('RABBITMQ_STATS_AD_IMPRESSION_ROUTING_KEY') ??
      'stats.ad.impression';
    this.dlqExchange =
      this.config.get<string>('RABBITMQ_DLQ_EXCHANGE') ?? 'dlq.stats';

    if (!url) {
      this.logger.error(
        'RABBITMQ_URL is not set. Stats will be stored in Redis fallback queue only.',
      );
      this.circuitState = CircuitBreakerState.OPEN;
      return;
    }

    // AMQP URLs normally embed `user:password@`; mask the password so that
    // credentials never reach the logs.
    const safeUrl = url.replace(/\/\/([^:@/]*):[^@/]*@/, '//$1:***@');

    try {
      this.logger.log(`Connecting to RabbitMQ at ${safeUrl} ...`);
      await this.initializeConnection(url);
      this.logger.log('RabbitMQ connection initialized successfully');
    } catch (error) {
      this.logger.error(
        `Failed to initialize RabbitMQ connection: ${error}`,
        error instanceof Error ? error.stack : undefined,
      );
      // Start in OPEN; canAttempt() will probe again once the timeout elapses.
      this.circuitState = CircuitBreakerState.OPEN;
      this.nextAttemptTime = Date.now() + this.circuitConfig.timeout;
    }
  }
  /**
   * Opens a connection and confirm channel, wires error/close handlers to the
   * circuit breaker, and declares the exchanges plus the DLQ queue and binding.
   *
   * `this.connection` / `this.channel` are assigned only after the whole
   * topology has been declared. If any step after `connect()` fails, the
   * half-initialised connection is closed and the error is rethrown, so no
   * socket is leaked and no stale handle is left behind for
   * `reconnectIfNeeded()` to mistake for a healthy one.
   *
   * @param url - AMQP connection URL.
   * @throws Whatever amqplib rejects with while connecting or declaring topology.
   */
  private async initializeConnection(url: string): Promise<void> {
    const connection = await amqp.connect(url);

    // Handle connection errors
    connection.on('error', (err) => {
      this.logger.error('RabbitMQ connection error:', err);
      this.openCircuit();
    });
    connection.on('close', () => {
      this.logger.warn('RabbitMQ connection closed');
      this.openCircuit();
    });

    try {
      // Use a confirm channel so we know when broker has accepted the message
      const channel = await connection.createConfirmChannel();

      // Handle channel errors
      channel.on('error', (err) => {
        this.logger.error('RabbitMQ channel error:', err);
        this.openCircuit();
      });
      channel.on('close', () => {
        this.logger.warn('RabbitMQ channel closed');
        this.openCircuit();
      });

      // Assert exchanges
      await channel.assertExchange(this.exchange, 'topic', {
        durable: true,
      });
      if (this.statsExchange !== this.exchange) {
        await channel.assertExchange(this.statsExchange, 'topic', {
          durable: true,
        });
      }

      // Assert Dead Letter Exchange
      await channel.assertExchange(this.dlqExchange, 'topic', {
        durable: true,
      });

      // Assert DLQ queue for stats events
      await channel.assertQueue('dlq.stats.events', {
        durable: true,
        arguments: {
          'x-message-ttl': this.messageTTL, // Messages expire after 24 hours
          'x-max-length': 100000, // Maximum 100k messages in DLQ
        },
      });

      // Bind DLQ queue to DLQ exchange.
      // sendToDLQ() publishes with 'dlq.{original-routing-key}', so 'dlq.#'
      // matches every DLQ message regardless of its original routing key.
      await channel.bindQueue('dlq.stats.events', this.dlqExchange, 'dlq.#');

      // Publish the handles only now that they are fully usable.
      this.connection = connection;
      this.channel = channel;
    } catch (error) {
      // Best-effort cleanup of the partially initialised connection.
      try {
        await connection.close();
      } catch (closeError) {
        this.logger.debug(
          `Ignoring error while closing half-initialised RabbitMQ connection: ${closeError}`,
        );
      }
      throw error;
    }

    this.logger.log(
      `RabbitMQ publisher ready. exchange="${this.exchange}", statsExchange="${this.statsExchange}", dlqExchange="${this.dlqExchange}"`,
    );
  }
  /**
   * Nest lifecycle hook: closes the channel and then the connection.
   *
   * The two closes are attempted independently, so a failing `channel.close()`
   * (for example on an already-closed channel) no longer prevents the
   * connection from being closed. Errors are logged, never thrown.
   */
  async onModuleDestroy(): Promise<void> {
    // The 'close' handlers call openCircuit(), which is a no-op when the state
    // is already OPEN. Opening the breaker up front therefore stops shutdown
    // from scheduling a new reconnection timer, and the infinite deadline keeps
    // canAttempt() from probing again. A timer scheduled earlier is unaffected.
    this.circuitState = CircuitBreakerState.OPEN;
    this.nextAttemptTime = Number.POSITIVE_INFINITY;

    try {
      await this.channel?.close();
    } catch (error) {
      this.logger.error(
        'Error while closing RabbitMQ channel',
        error instanceof Error ? error.stack : String(error),
      );
    }

    try {
      await this.connection?.close();
    } catch (error) {
      this.logger.error(
        'Error while closing RabbitMQ connection',
        error instanceof Error ? error.stack : String(error),
      );
    }
  }
  /**
   * Circuit breaker: trip to OPEN so publishers stop hitting RabbitMQ.
   *
   * Does nothing when the breaker is already OPEN. Otherwise it resets both
   * counters, sets the next probe deadline, and schedules a reconnection.
   */
  private openCircuit(): void {
    if (this.circuitState === CircuitBreakerState.OPEN) {
      return;
    }

    // Logged before the reset so the message shows the counters that led here.
    this.logger.warn(
      `Circuit breaker OPENED (failureCount=${this.failureCount}, successCount=${this.successCount}). Will retry after ${this.circuitConfig.timeout}ms`,
    );

    this.circuitState = CircuitBreakerState.OPEN;
    this.failureCount = 0;
    this.successCount = 0;
    this.nextAttemptTime = Date.now() + this.circuitConfig.timeout;

    this.scheduleReconnection();
  }
  /**
   * Circuit breaker: enter HALF_OPEN to test whether the broker recovered.
   *
   * Resets the success counter and tries to re-establish the connection before
   * any publish is let through.
   */
  private async halfOpenCircuit(): Promise<void> {
    this.logger.log(
      `Circuit breaker HALF-OPEN (failureCount=${this.failureCount}, successCount=${this.successCount}). Testing connection...`,
    );

    this.circuitState = CircuitBreakerState.HALF_OPEN;
    this.successCount = 0;

    // reconnectIfNeeded() may itself move the breaker to CLOSED or back to OPEN.
    await this.reconnectIfNeeded();
  }
  /**
   * Circuit breaker: return to CLOSED (normal operation) and reset both counters.
   */
  private closeCircuit(): void {
    // Logged before the reset so the message shows the counters that led here.
    this.logger.log(
      `Circuit breaker CLOSED (failureCount=${this.failureCount}, successCount=${this.successCount}). Resuming normal operation.`,
    );

    this.successCount = 0;
    this.failureCount = 0;
    this.circuitState = CircuitBreakerState.CLOSED;
  }
  /**
   * Feeds a successful publish into the circuit breaker.
   *
   * Always clears the failure streak. While HALF_OPEN it also counts the
   * success and closes the circuit once `successThreshold` is reached.
   */
  private recordSuccess(): void {
    this.failureCount = 0;

    if (this.circuitState !== CircuitBreakerState.HALF_OPEN) {
      return;
    }

    this.successCount += 1;
    if (this.successCount >= this.circuitConfig.successThreshold) {
      this.closeCircuit();
    }
  }
  /**
   * Feeds a failed publish into the circuit breaker.
   *
   * A failure while HALF_OPEN re-opens the circuit immediately. While CLOSED
   * the circuit opens once `failureThreshold` failures have accumulated.
   */
  private recordFailure(): void {
    this.failureCount += 1;

    const probeFailed = this.circuitState === CircuitBreakerState.HALF_OPEN;
    const thresholdReached =
      this.circuitState === CircuitBreakerState.CLOSED &&
      this.failureCount >= this.circuitConfig.failureThreshold;

    if (probeFailed || thresholdReached) {
      this.openCircuit();
    }
  }
  /**
   * Decides whether the circuit breaker lets a publish attempt through.
   *
   * - CLOSED / HALF_OPEN: always allowed.
   * - OPEN before the deadline: rejected.
   * - OPEN after the deadline: moves to HALF_OPEN (which tries to reconnect)
   *   and allows the attempt only if the breaker did not re-open during that
   *   probe. Previously this returned `true` unconditionally, so callers ran
   *   the full retry cycle against a connection known to be down.
   *
   * @returns `true` when the caller may try to publish.
   */
  private async canAttempt(): Promise<boolean> {
    if (
      this.circuitState === CircuitBreakerState.CLOSED ||
      this.circuitState === CircuitBreakerState.HALF_OPEN
    ) {
      return true;
    }

    // OPEN state: still inside the cool-down window.
    if (Date.now() < this.nextAttemptTime) {
      return false;
    }

    await this.halfOpenCircuit();

    // halfOpenCircuit() mutates circuitState, but control-flow analysis still
    // sees it as OPEN here; the cast widens it back to the full enum.
    return (
      (this.circuitState as CircuitBreakerState) !== CircuitBreakerState.OPEN
    );
  }
  /**
   * Arms a one-shot timer that moves the breaker to HALF_OPEN after the
   * circuit timeout, provided it is still OPEN at that point.
   *
   * At most one timer is pending at a time; further calls are ignored until it
   * has fired.
   */
  private scheduleReconnection(): void {
    if (this.reconnectionScheduled) {
      return; // A timer is already pending.
    }
    this.reconnectionScheduled = true;

    this.logger.debug(
      `Scheduling reconnection attempt in ${this.circuitConfig.timeout}ms`,
    );

    const onTimeout = async (): Promise<void> => {
      this.reconnectionScheduled = false;
      if (this.circuitState !== CircuitBreakerState.OPEN) {
        return; // The breaker already left OPEN by other means.
      }
      await this.halfOpenCircuit();
    };

    setTimeout(onTimeout, this.circuitConfig.timeout);
  }
  /**
   * Re-establishes the RabbitMQ connection when the connection or channel
   * handle is missing (or the connection reports itself destroyed).
   *
   * Only one reconnection runs at a time. On success the circuit is closed if
   * this call started as a HALF_OPEN probe. On failure during HALF_OPEN the
   * circuit is re-opened. Never throws.
   */
  private async reconnectIfNeeded(): Promise<void> {
    // Check if reconnection is needed
    const connectionClosed =
      !this.connection || this.connection.connection?.destroyed;
    const channelClosed = !this.channel;

    if (!connectionClosed && !channelClosed) {
      this.logger.debug(
        'Connection and channel are healthy, no reconnection needed',
      );
      return;
    }

    // Prevent concurrent reconnection attempts
    if (this.isReconnecting) {
      this.logger.debug('Reconnection already in progress, skipping');
      return;
    }

    this.isReconnecting = true;

    // Closing the stale handles below emits 'close', whose handler calls
    // openCircuit() and flips HALF_OPEN to OPEN. Remember the probe intent now
    // so a successful reconnect can still close the circuit.
    const startedAsProbe = this.circuitState === CircuitBreakerState.HALF_OPEN;

    this.logger.log(
      `🔄 Starting RabbitMQ reconnection attempt (circuitState=${this.circuitState})...`,
    );

    try {
      // Get current URL from config
      const url = this.config.get<string>('RABBITMQ_URL');
      if (!url) {
        this.logger.error(
          '❌ Reconnection failed: RABBITMQ_URL is not set. Cannot reconnect to RabbitMQ.',
        );
        return;
      }

      // Close existing handles if any; failures are expected when they are
      // already closed and are deliberately ignored.
      try {
        await this.channel?.close();
      } catch {
        // Ignore errors on close
      }
      try {
        await this.connection?.close();
      } catch {
        // Ignore errors on close
      }

      // Clear references
      this.channel = undefined;
      this.connection = undefined;

      // AMQP URLs normally embed `user:password@`; mask the password so that
      // credentials never reach the logs.
      const safeUrl = url.replace(/\/\/([^:@/]*):[^@/]*@/, '//$1:***@');
      this.logger.debug(`🔌 Reconnecting to RabbitMQ at ${safeUrl}...`);

      await this.initializeConnection(url);

      this.logger.log(
        `✅ RabbitMQ reconnection successful (hasConnection=${!!this.connection}, hasChannel=${!!this.channel})`,
      );

      // Close circuit if reconnection succeeded
      if (startedAsProbe && this.circuitState !== CircuitBreakerState.CLOSED) {
        this.logger.log(
          'Reconnection successful during HALF_OPEN, closing circuit',
        );
        this.closeCircuit();
      }
    } catch (error) {
      this.logger.error(
        `❌ RabbitMQ reconnection failed (circuitState=${this.circuitState}): ${error}`,
        error instanceof Error ? error.stack : undefined,
      );

      // Keep circuit open on reconnection failure
      if (this.circuitState === CircuitBreakerState.HALF_OPEN) {
        this.logger.warn(
          '⚠️ Reconnection failed during HALF_OPEN, reopening circuit',
        );
        this.openCircuit();
      }
    } finally {
      // Single exit point for the in-progress flag, whatever path was taken.
      this.isReconnecting = false;
    }
  }
  /**
   * Publisher-side duplicate check: was this messageId already published?
   *
   * Looks up `rabbitmq:publish-idempotency:{messageId}` in Redis (the key is
   * written by `markAsProcessed()` with a 7-day TTL). This only guards against
   * re-publishing from this side; consumers still need their own duplicate
   * detection.
   *
   * @param messageId - Unique message identifier.
   * @returns `true` if the key exists. Returns `false` when Redis fails, since
   *          a possible duplicate is preferred over a lost message.
   */
  private async checkIdempotency(messageId: string): Promise<boolean> {
    const redisKey = `rabbitmq:publish-idempotency:${messageId}`;

    try {
      const hits = await this.redis.exists(redisKey);
      return hits > 0;
    } catch (error) {
      this.logger.error(
        `Failed to check publish idempotency for ${messageId}: ${error}`,
      );
      // Fail open: treat the message as not yet published.
      return false;
    }
  }
  /**
   * Records that a messageId has been published (publisher-side idempotency).
   *
   * Writes `rabbitmq:publish-idempotency:{messageId}` with the 7-day
   * idempotency TTL so `checkIdempotency()` can skip later re-publishes.
   * Redis errors are logged and swallowed; they never fail the publish.
   *
   * @param messageId - Unique message identifier.
   */
  private async markAsProcessed(messageId: string): Promise<void> {
    const redisKey = `rabbitmq:publish-idempotency:${messageId}`;

    try {
      await this.redis.set(redisKey, '1', this.idempotencyTTL);
    } catch (error) {
      this.logger.error(
        `Failed to mark ${messageId} as published (idempotency): ${error}`,
      );
    }
  }
  /**
   * Parks a message in the Redis fallback queue under
   * `rabbitmq:fallback:{routingKey}:{messageId}` for 24 hours.
   *
   * Stats payloads carry `bigint` timestamps, which plain `JSON.stringify`
   * rejects with a TypeError. The payload is therefore normalised first
   * (BigInt → decimal string) so the store cannot fail on those fields.
   * NOTE(review): assumes `redis.setJson` serialises with plain JSON; confirm
   * against RedisService.
   *
   * Errors are logged as CRITICAL and swallowed; this method never throws.
   *
   * @param routingKey - Routing key the message was meant to be published with.
   * @param payload - Original event payload.
   * @param messageId - Unique message identifier (used in the Redis key).
   */
  private async storeInFallbackQueue(
    routingKey: string,
    payload: unknown,
    messageId: string,
  ): Promise<void> {
    try {
      const fallbackKey = `rabbitmq:fallback:${routingKey}:${messageId}`;

      // Round-trip through JSON with a BigInt-aware replacer to get a payload
      // any JSON serialiser can handle.
      const jsonSafePayload: unknown = JSON.parse(
        JSON.stringify(payload, (_key: string, value: unknown) =>
          typeof value === 'bigint' ? value.toString() : value,
        ),
      );

      // Derived from the shared message TTL (ms → s) instead of a second literal.
      const ttlSeconds = Math.floor(this.messageTTL / 1000);
      await this.redis.setJson(fallbackKey, jsonSafePayload, ttlSeconds);

      this.logger.warn(
        `Stored message ${messageId} in Redis fallback queue: ${fallbackKey}`,
      );
    } catch (error) {
      this.logger.error(
        `CRITICAL: Failed to store message ${messageId} in fallback queue: ${error}`,
        error instanceof Error ? error.stack : undefined,
      );
    }
  }
  /**
   * Publishes a message to the dead-letter exchange for manual inspection.
   *
   * The routing key is `dlq.{routingKey}`, and the reason plus a timestamp are
   * attached as headers. Resolves once the broker confirms the publish.
   * Failures, including a missing channel, are logged and never thrown.
   *
   * @param routingKey - Original routing key of the message.
   * @param payload - Original event payload.
   * @param reason - Human-readable reason, stored in the `x-death-reason` header.
   */
  private async sendToDLQ(
    routingKey: string,
    payload: unknown,
    reason: string,
  ): Promise<void> {
    const channel = this.channel;
    if (!channel) {
      this.logger.error(
        `Cannot send to DLQ: channel not available. Reason: ${reason}`,
      );
      return;
    }

    const dlqRoutingKey = `dlq.${routingKey}`;

    try {
      const payloadBuffer = this.toPayloadBuffer(payload);
      const publishOptions = {
        persistent: true,
        contentType: 'application/json',
        headers: {
          'x-death-reason': reason,
          'x-death-timestamp': Date.now(),
        },
      };

      // Wrap the confirm callback in a promise so the publish can be awaited.
      await new Promise<void>((resolve, reject) => {
        channel.publish(
          this.dlqExchange,
          dlqRoutingKey,
          payloadBuffer,
          publishOptions,
          (err) => (err ? reject(err) : resolve()),
        );
      });

      this.logger.warn(
        `Sent message to DLQ: exchange="${this.dlqExchange}", routingKey="${dlqRoutingKey}", queue="dlq.stats.events". Reason: ${reason}`,
      );
    } catch (error) {
      this.logger.error(
        `Failed to send message to DLQ (routingKey="${dlqRoutingKey}"): ${error}`,
        error instanceof Error ? error.stack : undefined,
      );
    }
  }
  /**
   * Runs `publishFn` up to `maxRetries` times, sleeping between attempts.
   *
   * The sleep before retry N comes from `retryDelays[N - 1]`. No sleep follows
   * the final attempt, so with 3 attempts only the first two delays are used.
   *
   * @param publishFn - The publish operation to attempt.
   * @param context - Short description included in log messages.
   * @throws The error of the last attempt once all attempts have failed.
   */
  private async retryPublish(
    publishFn: () => Promise<void>,
    context: string,
  ): Promise<void> {
    let attempt = 0;

    while (attempt < this.maxRetries) {
      try {
        await publishFn();
        return; // Published.
      } catch (error) {
        if (attempt === this.maxRetries - 1) {
          this.logger.error(
            `Failed to publish after ${this.maxRetries} attempts (${context}): ${error}`,
          );
          throw error;
        }

        const delay = this.retryDelays[attempt];
        this.logger.warn(
          `Publish attempt ${attempt + 1} failed (${context}). Retrying in ${delay}ms...`,
        );
        await new Promise((resolve) => setTimeout(resolve, delay));
      }

      attempt += 1;
    }
  }
  /**
   * Publishes a user.login event (fire-and-forget; never throws).
   *
   * Lower-tier event: guarded by the circuit breaker and retried through
   * `retryPublish()`, but with no Redis fallback and no idempotency. While the
   * circuit is OPEN the event is dropped with a warning.
   *
   * @param event - The login event to publish.
   */
  async publishUserLogin(event: UserLoginEventPayload): Promise<void> {
    const allowed = await this.canAttempt();
    if (!allowed) {
      this.logger.warn(
        `⚠️ Circuit breaker OPEN. Dropping user.login event for uid=${event.uid} (non-critical event, no fallback)`,
      );
      return; // Dropped on purpose: there is no fallback for this event.
    }

    const context = `user.login uid=${event.uid}`;

    try {
      await this.retryPublish(() => this.publishUserLoginCore(event), context);

      this.recordSuccess();
      this.logger.debug(`Published user.login event for uid=${event.uid}`);
    } catch (error) {
      // Every attempt failed: report to the breaker, log, and swallow.
      this.recordFailure();
      this.logger.error(
        `Failed to publish user.login after ${this.maxRetries} retries for uid=${event.uid}: ${error}`,
      );
    }
  }
  /**
   * Single user.login publish attempt; the unit of work retried by
   * `retryPublish()`.
   *
   * Resolves when the broker confirms the message.
   *
   * @param event - The login event; serialised with `JSON.stringify`.
   * @throws Error('RabbitMQ channel not ready') when no channel is available,
   *         or the error reported by the publish confirm.
   */
  private async publishUserLoginCore(
    event: UserLoginEventPayload,
  ): Promise<void> {
    const channel = this.channel;
    if (!channel) {
      throw new Error('RabbitMQ channel not ready');
    }

    const body = Buffer.from(JSON.stringify(event));
    const publishOptions = {
      persistent: true,
      contentType: 'application/json',
    };

    await new Promise<void>((resolve, reject) => {
      channel.publish(
        this.exchange,
        this.routingKeyLogin,
        body,
        publishOptions,
        (err) => {
          if (err) {
            reject(err);
          } else {
            resolve();
          }
        },
      );
    });
  }
  /**
   * Publishes an ads.click event (fire-and-forget; never throws).
   *
   * Lower-tier event: guarded by the circuit breaker and retried through
   * `retryPublish()`, but with no Redis fallback and no idempotency. While the
   * circuit is OPEN the event is dropped with a warning.
   *
   * @param event - The ads click event to publish.
   */
  async publishAdsClick(event: AdsClickEventPayload): Promise<void> {
    const allowed = await this.canAttempt();
    if (!allowed) {
      this.logger.warn(
        `⚠️ Circuit breaker OPEN. Dropping ads.click event for adsId=${event.adsId} (non-critical event, no fallback)`,
      );
      return; // Dropped on purpose: there is no fallback for this event.
    }

    const context = `ads.click adsId=${event.adsId}`;

    try {
      await this.retryPublish(() => this.publishAdsClickCore(event), context);

      this.recordSuccess();
      this.logger.debug(`Published ads.click event for adsId=${event.adsId}`);
    } catch (error) {
      // Every attempt failed: report to the breaker, log, and swallow.
      this.recordFailure();
      this.logger.error(
        `Failed to publish ads.click after ${this.maxRetries} retries for adsId=${event.adsId}: ${error}`,
      );
    }
  }
  /**
   * Single ads.click publish attempt; the unit of work retried by
   * `retryPublish()`.
   *
   * Resolves when the broker confirms the message.
   *
   * @param event - The ads click event; serialised with `JSON.stringify`.
   * @throws Error('RabbitMQ channel not ready') when no channel is available,
   *         or the error reported by the publish confirm.
   */
  private async publishAdsClickCore(
    event: AdsClickEventPayload,
  ): Promise<void> {
    const channel = this.channel;
    if (!channel) {
      throw new Error('RabbitMQ channel not ready');
    }

    const body = Buffer.from(JSON.stringify(event));
    const publishOptions = {
      persistent: true,
      contentType: 'application/json',
    };

    await new Promise<void>((resolve, reject) => {
      channel.publish(
        this.exchange,
        this.routingKeyAdsClick,
        body,
        publishOptions,
        (err) => {
          if (err) {
            reject(err);
          } else {
            resolve();
          }
        },
      );
    });
  }
  /**
   * Publishes a stats.ad.click event through the full-resilience path
   * (`publishStatsEventWithFallback`), keyed by `event.messageId`.
   *
   * @param event - The ad click stats event.
   */
  async publishStatsAdClick(event: StatsAdClickEventPayload): Promise<void> {
    const context = `stats.ad.click adId=${event.adId}`;

    await this.publishStatsEventWithFallback(
      this.routingKeyStatsAdClick,
      event,
      event.messageId,
      context,
    );
  }
  /**
   * Publishes a stats.video.click event through the full-resilience path
   * (`publishStatsEventWithFallback`), keyed by `event.messageId`.
   *
   * @param event - The video click stats event.
   */
  async publishStatsVideoClick(
    event: StatsVideoClickEventPayload,
  ): Promise<void> {
    const context = `stats.video.click videoId=${event.videoId}`;

    await this.publishStatsEventWithFallback(
      this.routingKeyStatsVideoClick,
      event,
      event.messageId,
      context,
    );
  }
  /**
   * Publishes a stats.ad.impression event through the full-resilience path
   * (`publishStatsEventWithFallback`), keyed by `event.messageId`.
   *
   * @param event - The ad impression stats event.
   */
  async publishStatsAdImpression(
    event: StatsAdImpressionEventPayload,
  ): Promise<void> {
    const context = `stats.ad.impression adId=${event.adId}`;

    await this.publishStatsEventWithFallback(
      this.routingKeyStatsAdImpression,
      event,
      event.messageId,
      context,
    );
  }
  /**
   * PUBLIC API for replaying messages from the Redis fallback queue.
   * Used by RabbitmqFallbackReplayService to republish failed messages.
   *
   * IMPORTANT: A failed replay is NOT written back to the fallback queue, to
   * prevent infinite replay loops. It is only sent to the DLQ for manual
   * inspection. When the circuit breaker is OPEN the replay is skipped
   * altogether and nothing is written. Never throws.
   *
   * @param routingKey - Original routing key (e.g., 'stats.ad.click')
   * @param payload - Original message payload
   * @param messageId - Original message ID (from payload.messageId)
   */
  async replayFallbackMessage(
    routingKey: string,
    payload: unknown,
    messageId: string,
  ): Promise<void> {
    // Same pipeline as a first-time publish, but with fallback storage off so
    // the behaviour matches the contract documented above.
    return this.publishStatsEventWithFallback(
      routingKey,
      payload,
      messageId,
      `fallback-replay routingKey=${routingKey}`,
      false,
    );
  }

  /**
   * Publish pipeline for stats events: idempotency check, circuit breaker,
   * retries, then on failure the Redis fallback queue and the DLQ.
   *
   * Fire-and-forget: never throws.
   *
   * @param routingKey - Routing key to publish with.
   * @param event - Event payload.
   * @param messageId - Unique message id (idempotency and fallback key).
   * @param context - Short description included in log messages.
   * @param storeInFallbackOnFailure - When `true` (the default), a message that
   *        cannot be published is stored in the Redis fallback queue. Replays
   *        pass `false` so a failed replay is never re-queued.
   */
  private async publishStatsEventWithFallback(
    routingKey: string,
    event: unknown,
    messageId: string,
    context: string,
    storeInFallbackOnFailure = true,
  ): Promise<void> {
    // 1. Check idempotency
    const alreadyProcessed = await this.checkIdempotency(messageId);
    if (alreadyProcessed) {
      this.logger.debug(`Skipping duplicate message ${messageId} (${context})`);
      return;
    }

    // 2. Check circuit breaker
    if (!(await this.canAttempt())) {
      if (storeInFallbackOnFailure) {
        this.logger.warn(
          `Circuit breaker OPEN. Storing messageId=${messageId} in fallback queue (${context}, routingKey=${routingKey})`,
        );
        await this.storeInFallbackQueue(routingKey, event, messageId);
      } else {
        this.logger.warn(
          `Circuit breaker OPEN. Skipping publish of messageId=${messageId} without re-storing it in fallback queue (${context}, routingKey=${routingKey})`,
        );
      }
      return;
    }

    // 3. Attempt to publish with retry logic
    try {
      await this.retryPublish(async () => {
        await this.publishStatsEvent(routingKey, event, context);
      }, context);

      // Success!
      this.recordSuccess();
      await this.markAsProcessed(messageId);
      this.logger.debug(`Successfully published ${messageId} (${context})`);
    } catch (error) {
      // All retries failed
      this.recordFailure();
      this.logger.error(
        `All retry attempts failed for messageId=${messageId} (${context}, routingKey=${routingKey}): ${error}`,
      );

      // 4. Store in fallback queue (skipped for replays)
      if (storeInFallbackOnFailure) {
        await this.storeInFallbackQueue(routingKey, event, messageId);
      }

      // 5. Send to DLQ for manual inspection
      await this.sendToDLQ(routingKey, event, `Max retries exceeded: ${error}`);

      // Don't throw error - fire-and-forget pattern
    }
  }
  /**
   * Publish one event to the stats exchange and settle when the channel's
   * publish callback fires. This is the raw operation wrapped by the retry logic.
   *
   * @param routingKey - Routing key passed to the channel's `publish`.
   * @param event - Payload; serialized with `toPayloadBuffer`.
   * @param context - Free-form label included in log lines.
   * @throws Error when no channel is available. Rejects with the callback's
   *   error when the publish callback reports one.
   */
  private async publishStatsEvent(
    routingKey: string,
    event: unknown,
    context: string,
  ): Promise<void> {
    // Capture the channel once so the closure below needs no non-null assertion.
    const channel = this.channel;
    if (!channel) {
      throw new Error('RabbitMQ channel not ready');
    }

    const body = this.toPayloadBuffer(event);

    // The current epoch-ms value arrives as a BigInt; the publish options take
    // a plain number, so convert it with the range-checked helper.
    const timestamp = this.safeNumberFromBigInt(nowEpochMsBigInt(), 'timestamp');

    const publishOptions = {
      persistent: true,
      contentType: 'application/json',
      timestamp,
      expiration: this.messageTTL.toString(), // per-message TTL
    };

    await new Promise<void>((resolve, reject) => {
      channel.publish(
        this.statsExchange,
        routingKey,
        body,
        publishOptions,
        (err) => {
          if (!err) {
            this.logger.debug(
              `Published stats event (${context}) to ${this.statsExchange}/${routingKey}`,
            );
            resolve();
            return;
          }

          this.logger.error(
            `Failed to publish stats event (${context}, routingKey=${routingKey}): ${err.message}`,
            err.stack,
          );
          reject(err);
        },
      );
    });
  }
  /**
   * Serialize an event payload to a Buffer of JSON text for publishing.
   *
   * BigInt handling:
   * - `JSON.stringify` throws a TypeError on BigInt values, so a replacer
   *   converts every BigInt to its decimal string form.
   * - Consumers must parse those strings back into a suitable numeric type.
   * - BigInts above `Number.MAX_SAFE_INTEGER * 1000` are logged as suspicious
   *   but are still serialized.
   *
   * Size note: no payload-size limit is enforced here. The broker's maximum
   * message size still applies, so consider adding a check if payloads grow.
   *
   * @param event - Value to serialize.
   * @returns Buffer holding the JSON text.
   * @throws TypeError when the event has no JSON representation (for example
   *   `undefined`, a function or a symbol); `JSON.stringify` returns
   *   `undefined` for those inputs.
   */
  private toPayloadBuffer(event: unknown): Buffer {
    // Computed once per call instead of once per BigInt value encountered.
    const suspiciousThreshold = BigInt(Number.MAX_SAFE_INTEGER) * BigInt(1000);

    const json: string | undefined = JSON.stringify(
      event,
      (key: string, value: unknown) => {
        if (typeof value !== 'bigint') {
          return value;
        }

        // Defensive check: a value this large is more likely corruption than
        // a real timestamp or counter.
        if (value > suspiciousThreshold) {
          this.logger.warn(
            `Suspiciously large BigInt in payload: key=${key}, value=${value.toString()}`,
          );
        }

        // JSON has no BigInt type; emit the decimal string instead.
        return value.toString();
      },
    );

    // Without this guard Buffer.from(undefined) fails with an opaque
    // argument-type error that hides the real cause.
    if (json === undefined) {
      throw new TypeError('Event payload is not JSON-serializable');
    }

    return Buffer.from(json);
  }
  /**
   * Narrow a BigInt to a Number, refusing values a double cannot hold exactly.
   *
   * Only integers within
   * [`Number.MIN_SAFE_INTEGER`, `Number.MAX_SAFE_INTEGER`] are converted.
   * Anything outside that range logs a warning and yields 0.
   *
   * @param value - BigInt to convert.
   * @param context - Label used in the warning (e.g. 'timestamp', 'count').
   * @returns The converted number, or 0 when the value is out of the safe range.
   */
  private safeNumberFromBigInt(value: bigint, context: string): number {
    const lowerBound = BigInt(Number.MIN_SAFE_INTEGER);
    const upperBound = BigInt(Number.MAX_SAFE_INTEGER);
    const fitsSafely = value >= lowerBound && value <= upperBound;

    if (fitsSafely) {
      return Number(value);
    }

    this.logger.warn(
      `BigInt value out of safe range for ${context}: ${value.toString()}. Using 0 as fallback.`,
    );
    return 0; // fallback for out-of-range input
  }
  /**
   * Snapshot of the circuit-breaker and connection fields, for monitoring.
   *
   * `hasConnection` is true only when a connection object exists and its
   * nested `connection.destroyed` flag is not set. `hasChannel` reflects
   * whether a channel object is currently held.
   *
   * @returns The current values of the fields listed in the return type.
   */
  getCircuitStatus(): {
    state: CircuitBreakerState;
    failureCount: number;
    successCount: number;
    hasConnection: boolean;
    hasChannel: boolean;
    isReconnecting: boolean;
    nextAttemptTime: number;
  } {
    const link = this.connection;
    const hasConnection = Boolean(link && !link.connection?.destroyed);
    const hasChannel = Boolean(this.channel);

    return {
      state: this.circuitState,
      failureCount: this.failureCount,
      successCount: this.successCount,
      hasConnection,
      hasChannel,
      isReconnecting: this.isReconnecting,
      nextAttemptTime: this.nextAttemptTime,
    };
  }
  907. }