// Constants
const CONFIG = {
  USER_AGENT: "Googlebot/2.1 (+http://www.google.com/bot.html)",
  // NOTE(review): the three HTML regexes below were damaged in the source
  // (their `<tag...>` fragments were stripped and PRICE_REGEX was missing
  // entirely). They are reconstructed from how the matches are consumed:
  // a product block opens on a tag carrying `data-sku`, the link is an
  // `<a ... href="...">`, and the price is the number right after a tag
  // whose class contains `price`. Confirm against live markup.
  //
  // One product block: from an opening tag with `data-sku` up to the next
  // such tag, or to the end of the HTML so the last product is kept too.
  PRODUCT_BLOCK_REGEX: /<[a-z][a-z0-9]*\b[^>]*data-sku[^>]*>([\s\S]*?)(?=<[a-z][a-z0-9]*\b[^>]*data-sku|$)/gi,
  // Group 1 is the raw href value of the first anchor in a block.
  LINK_REGEX: /<a\b[^>]*\bhref="([^"]*)"/i,
  // Group 1 is the raw price text (digits and decimal comma).
  PRICE_REGEX: /<[^>]*\bclass="[^"]*\bprice\b[^"]*"[^>]*>\s*([\d,]+)/i,
  SEE_MORE_PRICE_REGEX: /class="hideFromPro lpPictosMsg"[^>]*>\+ d'offres à partir de (\d+),\d+€/g,
  BOT_DETECTION_TEXT: "Comment activer le javascript",
  PRICE_CHANGE_THRESHOLD: 0.4,
  BASE_URL: "https://www.cdiscount.com/ProductListUC.mvc/UpdateJsonPage?page=",
  // Base URL of the controller that hands out categories and receives prices.
  CONTROLLER_URL: "http://164.132.203.174:3001",
  // Divisor used to turn totalResultCount into a page count.
  RESULTS_PER_PAGE: 50,
  // Number of listing pages requested per UrlFetchApp.fetchAll call.
  PAGES_PER_FETCH_BATCH: 50,
  // Number of products posted to the controller per request.
  PRICE_BATCH_SIZE: 500,
  POST_HEADER: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.cdiscount.com/",
    "x-requested-with": "XMLHttpRequest",
  },
};

// Named HTML entities that removeEncodedAccents decodes; any other named
// entity is left untouched.
const NAMED_ENTITIES = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

/**
 * Parses a price written with a decimal comma (e.g. "12,99") into a number.
 *
 * @param {string} priceString - Raw price text; may be empty or undefined.
 * @returns {?number} The price as a float, or null when the input is empty
 *     or does not start with a number.
 */
function parsePrice(priceString) {
  if (!priceString) {
    return null;
  }
  const value = Number.parseFloat(priceString.replace(',', '.'));
  // Never let NaN leak into the data sent to the controller.
  return Number.isNaN(value) ? null : value;
}

/**
 * Tells whether a response body contains the bot-detection marker text.
 *
 * @param {string} responseText - Response body.
 * @returns {boolean} True when CONFIG.BOT_DETECTION_TEXT occurs in the body.
 */
function checkBotDetection(responseText) {
  return responseText.includes(CONFIG.BOT_DETECTION_TEXT);
}

/**
 * Extracts the product URL and lowest advertised price from one product block.
 *
 * @param {string} productBlockMatch - HTML of a single product block.
 * @returns {?{productUrl: string, lowestPrice: number}} The parsed product,
 *     or null when the block has no link or no parsable price.
 */
function parseProductData(productBlockMatch) {
  const linkMatch = productBlockMatch.match(CONFIG.LINK_REGEX);
  const priceMatch = productBlockMatch.match(CONFIG.PRICE_REGEX);
  if (!linkMatch || !priceMatch) {
    return null;
  }

  const productUrl = removeEncodedAccents(linkMatch[1]);
  const price = parsePrice(priceMatch[1]);
  if (price === null) {
    return null;
  }

  // The "+ d'offres à partir de X,XX€" banner may advertise a cheaper offer.
  // The regex is global, so match() yields whole matches; the amount is
  // re-extracted from the tail of the first one.
  const seeMoreMatches = productBlockMatch.match(CONFIG.SEE_MORE_PRICE_REGEX);
  const seeMoreAmount = seeMoreMatches ? seeMoreMatches[0].match(/(\d+,\d+)€$/) : null;
  const seeMorePrice = seeMoreAmount ? parsePrice(seeMoreAmount[1]) : null;

  const lowestPrice = seeMorePrice !== null && seeMorePrice < price ? seeMorePrice : price;
  return { productUrl, lowestPrice };
}

/**
 * Parses every product block, skipping blocks that contain "{{nodeid}}" or
 * "sponsorisés" as well as blocks that cannot be parsed.
 *
 * @param {string[]} productBlockMatches - HTML product blocks.
 * @returns {Array<{productUrl: string, lowestPrice: number}>} Parsed products.
 */
function processProductData(productBlockMatches) {
  const productData = [];
  for (const productBlockMatch of productBlockMatches) {
    if (productBlockMatch.includes("{{nodeid}}") || productBlockMatch.includes("sponsorisés")) {
      continue;
    }
    const parsedData = parseProductData(productBlockMatch);
    if (parsedData) {
      productData.push(parsedData);
    }
  }
  return productData;
}

/**
 * Turns the body of a listing response into products and a result count.
 *
 * @param {string} responseText - JSON body of a listing response.
 * @returns {{productData: Array<{productUrl: string, lowestPrice: number}>, totalResultCount: *}}
 *     Products parsed from `productsHtml`, plus `totalResultCount` as received.
 * @throws {Error} When the body is not valid JSON or has no `productsHtml`.
 */
function parseListingResponse(responseText) {
  const data = JSON.parse(responseText);
  const productBlockMatches = data.productsHtml.match(CONFIG.PRODUCT_BLOCK_REGEX) || [];
  return {
    productData: processProductData(productBlockMatches),
    totalResultCount: data.totalResultCount,
  };
}

/**
 * Posts product prices to the controller in batches of
 * CONFIG.PRICE_BATCH_SIZE. A failing batch is logged and the remaining
 * batches are still sent.
 *
 * @param {Array<{productUrl: string, lowestPrice: number}>} productData
 */
function sendProductPrices(productData) {
  for (let i = 0; i < productData.length; i += CONFIG.PRICE_BATCH_SIZE) {
    const products = productData
      .slice(i, i + CONFIG.PRICE_BATCH_SIZE)
      .map(({ productUrl, lowestPrice }) => ({ url: productUrl, price: lowestPrice }));

    try {
      const response = UrlFetchApp.fetch(`${CONFIG.CONTROLLER_URL}/products/prices`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        payload: JSON.stringify({ products }),
        muteHttpExceptions: false,
      });

      // Logger.log treats its first argument as a format string, so the
      // values are interpolated here instead of passed as extra arguments.
      if (response.getResponseCode() !== 200) {
        Logger.log(`Failed to send product prices. Status code: ${response.getResponseCode()}`);
      }
    } catch (error) {
      Logger.log(`Error sending product prices: ${error}`);
    }
  }
}

/**
 * Fetches one listing page, posts its prices to the controller and returns
 * the parsed products together with the total result count.
 *
 * NOTE(review): the caller later re-fetches page 1 through
 * fetchAllProductData and sends those prices again, so page 1 is posted
 * twice per category. Left unchanged; confirm whether that is intended.
 *
 * @param {string} payload - POST body sent with the listing request.
 * @param {number} [page=1] - Page number appended to CONFIG.BASE_URL.
 * @returns {{botBlacklisted: boolean}|{productData: Array, totalResultCount: *}}
 *     `{ botBlacklisted: true }` when the bot-detection text is found,
 *     otherwise the parsed products (empty, with a count of 0, when the
 *     response cannot be parsed).
 */
function fetchProductData(payload, page = 1) {
  const response = UrlFetchApp.fetch(CONFIG.BASE_URL + page, {
    method: 'POST',
    headers: CONFIG.POST_HEADER,
    payload: payload,
    followRedirects: false,
    muteHttpExceptions: true,
  });
  const responseText = response.getContentText();

  if (checkBotDetection(responseText)) {
    return { botBlacklisted: true };
  }

  try {
    const { productData, totalResultCount } = parseListingResponse(responseText);

    // Send product prices to the controller
    sendProductPrices(productData);

    return { productData, totalResultCount };
  } catch (error) {
    Logger.log(`Error parsing response for payload (page ${page}) ${payload}: ${error.message}`);
    return { productData: [], totalResultCount: 0 };
  }
}

/**
 * Fetches pages 1..totalPages of a listing, CONFIG.PAGES_PER_FETCH_BATCH
 * pages per fetchAll call, and collects every parsed product. Pages that
 * contain the bot-detection text or cannot be parsed are logged and skipped.
 *
 * @param {string} payload - POST body sent with every listing request.
 * @param {number} totalPages - Number of pages to fetch.
 * @returns {Array<{productUrl: string, lowestPrice: number}>} All products.
 */
function fetchAllProductData(payload, totalPages) {
  const allProductData = [];

  for (let startPage = 1; startPage <= totalPages; startPage += CONFIG.PAGES_PER_FETCH_BATCH) {
    const endPage = Math.min(startPage + CONFIG.PAGES_PER_FETCH_BATCH - 1, totalPages);
    const pagesToFetch = [];
    for (let page = startPage; page <= endPage; page++) {
      pagesToFetch.push({
        url: CONFIG.BASE_URL + page,
        method: 'POST',
        headers: CONFIG.POST_HEADER,
        payload: payload,
        followRedirects: false,
        muteHttpExceptions: true,
      });
    }

    Logger.log(`Fetching pages ${startPage} to ${endPage}`);
    const responses = UrlFetchApp.fetchAll(pagesToFetch);

    for (let i = 0; i < responses.length; i++) {
      const pageNumber = startPage + i;
      const responseText = responses[i].getContentText();

      if (checkBotDetection(responseText)) {
        Logger.log(`Bot detected for page ${pageNumber}. Skipping...`);
        continue;
      }

      try {
        const { productData } = parseListingResponse(responseText);
        Logger.log(`Found ${productData.length} products on page ${pageNumber}`);
        allProductData.push(...productData);
      } catch (error) {
        Logger.log(`Error parsing response for page ${pageNumber}: ${error.message}`);
      }
    }
  }

  Logger.log(`Total products found: ${allProductData.length}`);
  return allProductData;
}

/**
 * Entry point: asks the controller for categories, scrapes every page of
 * each one, posts the prices back and then marks the category as checked.
 * Returns early when the site home page shows the bot-detection text.
 */
function scrapeCdiscountPages() {
  // Check if the bot is blacklisted before fetching categories
  const botDetectionResponse = UrlFetchApp.fetch("https://www.cdiscount.com", {
    method: "GET",
    muteHttpExceptions: true,
  });
  if (checkBotDetection(botDetectionResponse.getContentText())) {
    Logger.log("Not the good IP address. Exiting...");
    return;
  }

  const response = UrlFetchApp.fetch(`${CONFIG.CONTROLLER_URL}/categories/to-try-check`, {
    method: "GET",
    muteHttpExceptions: true,
  });
  const categories = JSON.parse(response.getContentText());

  for (const category of categories) {
    const { url, payload } = category;
    Logger.log(`Processing category: ${url}`);

    const { totalResultCount, botBlacklisted } = fetchProductData(payload);
    if (botBlacklisted) {
      Logger.log(`Bot detected for category ${url}. Skipping...`);
      continue;
    }

    const totalPages = Math.ceil(totalResultCount / CONFIG.RESULTS_PER_PAGE);
    Logger.log(`Category ${url} has ${totalPages} pages`);

    const productDataList = fetchAllProductData(payload, totalPages);
    Logger.log(`Found ${productDataList.length} products for category ${url}`);

    // Send all product prices to the controller
    if (productDataList && productDataList.length > 0) {
      sendProductPrices(productDataList);
      Logger.log(`Sent ${productDataList.length} product prices to the controller`);
    } else {
      Logger.log(`No product data found for category ${url}. Skipping sending prices.`);
    }

    // muteHttpExceptions is false: a failed update throws and stops the run.
    UrlFetchApp.fetch(`${CONFIG.CONTROLLER_URL}/categories/checked?url=${encodeURIComponent(url)}`, {
      method: "PUT",
      headers: { "Content-Type": "application/json" },
      payload: JSON.stringify({}),
      muteHttpExceptions: false,
    });
  }
}

/**
 * Decodes HTML character references found in a URL: decimal (`&#233;`) and
 * hexadecimal (`&#xE9;`) references, plus the named entities listed in
 * NAMED_ENTITIES. Any other named entity, or an out-of-range code point, is
 * left as written instead of being turned into a NUL character.
 *
 * @param {string} url - URL text as it appears in an href attribute.
 * @returns {string} The URL with the supported references decoded.
 */
function removeEncodedAccents(url) {
  return url.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match, body) => {
    if (body[0] !== '#') {
      const name = body.toLowerCase();
      return NAMED_ENTITIES.has(name) ? NAMED_ENTITIES.get(name) : match;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws above U+10FFFF; keep such text as is.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  });
}

// Run on load. Guarded so the pure helpers above can be loaded outside
// Apps Script (e.g. for unit tests), where UrlFetchApp does not exist.
if (typeof UrlFetchApp !== 'undefined') {
  scrapeCdiscountPages();
}