// Constants
//
// Scraper configuration: request headers, the listing endpoint, and the
// regular expressions used to cut product blocks, links and prices out of the
// listing HTML.
//
// NOTE(review): the HTML regexes in this file had lost their "<...>" fragments
// (PRODUCT_BLOCK_REGEX was split over two lines, LINK_REGEX had collapsed into
// a bare number pattern and PRICE_REGEX was missing entirely although
// parseProductData reads it).
//   - PRODUCT_BLOCK_REGEX is restored from the surviving residue.
//   - LINK_REGEX and PRICE_REGEX are RECONSTRUCTIONS that only satisfy how
//     parseProductData consumes them (the link match must contain
//     href="...", and the text after the first '>' of the price match must be
//     the price). TODO: confirm both against the real listing markup.
const CONFIG = {
  // Not referenced by the code visible in this file.
  USER_AGENT: "Googlebot/2.1 (+http://www.google.com/bot.html)",
  // One match per <li ... data-sku ...> item. The lookahead requires a
  // following data-sku item, so the last item of a page is not matched.
  PRODUCT_BLOCK_REGEX: /<li[^>]*data-sku[^>]*>([\s\S]*?)(?=<li[^>]*data-sku)/g,
  // Opening part of an anchor tag, up to and including its href attribute.
  LINK_REGEX: /<a\s[^>]*href="[^"]*"/g,
  // A span whose class mentions "price", followed by digits/commas.
  PRICE_REGEX: /<span[^>]*class="[^"]*price[^"]*"[^>]*>([\d,]+)/g,
  SEE_MORE_PRICE_REGEX: /class="hideFromPro lpPictosMsg"[^>]*>\+ d'offres à partir de (\d+),\d+€/g,
  // Marker text looked for in response bodies by checkBotDetection.
  BOT_DETECTION_TEXT: "Comment activer le javascript",
  // Not referenced by the code visible in this file.
  PRICE_CHANGE_THRESHOLD: 0.4,
  // The page number is appended directly to this URL.
  BASE_URL: "https://www.cdiscount.com/ProductListUC.mvc/UpdateJsonPage?page=",
  POST_HEADER: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.cdiscount.com/",
    "x-requested-with": "XMLHttpRequest"
  },
};
/**
 * Parses a price written with a decimal comma (e.g. "12,34") into a number.
 *
 * Only the first comma is converted to a decimal point, so the input is
 * expected to contain at most one comma.
 *
 * @param {?string} priceString - Raw price text.
 * @returns {?number} The parsed floating-point price, or null when the input
 *     is missing, empty, not a string, or does not start with a number.
 */
function parsePrice(priceString) {
  // Non-strings (undefined, null, match arrays, ...) have no usable price.
  if (typeof priceString !== 'string' || priceString === '') {
    return null;
  }
  const value = Number.parseFloat(priceString.replace(',', '.'));
  // Never leak NaN to callers: they compare against null only.
  return Number.isNaN(value) ? null : value;
}
/**
 * Tells whether a response body contains the marker text configured in
 * CONFIG.BOT_DETECTION_TEXT.
 *
 * @param {string} responseText - Response body to inspect.
 * @returns {boolean} true when the marker text occurs in the body.
 */
function checkBotDetection(responseText) {
  const marker = CONFIG.BOT_DETECTION_TEXT;
  const position = responseText.indexOf(marker);
  return position !== -1;
}
/**
 * Extracts the product URL and the lowest advertised price from the HTML of
 * one product block.
 *
 * The URL is the href of the first CONFIG.LINK_REGEX match. The main price is
 * the text after the first '>' of the first CONFIG.PRICE_REGEX match. When the
 * block also carries a "+ d'offres à partir de X,YY€" label
 * (CONFIG.SEE_MORE_PRICE_REGEX), that price is considered too and the lower
 * of the two is reported.
 *
 * @param {string} productBlockMatch - HTML of a single product block.
 * @returns {?{productUrl: string, lowestPrice: number}} The parsed product,
 *     or null when the block has no link, no href, or no usable price.
 */
function parseProductData(productBlockMatch) {
  const linkMatches = productBlockMatch.match(CONFIG.LINK_REGEX) || [];
  const priceMatches = productBlockMatch.match(CONFIG.PRICE_REGEX) || [];
  if (linkMatches.length === 0 || priceMatches.length === 0) {
    return null;
  }

  // A link match without an href would otherwise crash on "[1]" of null.
  const hrefMatch = linkMatches[0].match(/href="([^"]*)"/);
  if (!hrefMatch) {
    return null;
  }
  const productUrl = removeEncodedAccents(hrefMatch[1]);

  const price = parsePrice(priceMatches[0].split('>')[1]);

  // The inner match returns an array; parsePrice needs the matched text
  // itself (e.g. "15,50"), not the array.
  const seeMoreMatches = productBlockMatch.match(CONFIG.SEE_MORE_PRICE_REGEX);
  const seeMoreText = seeMoreMatches ? seeMoreMatches[0].match(/\d+,\d+/) : null;
  const seeMorePrice = seeMoreText ? parsePrice(seeMoreText[0]) : null;

  // Keep only real numbers so a missing price never wins the comparison.
  const candidates = [price, seeMorePrice].filter(
    (value) => typeof value === 'number' && Number.isFinite(value)
  );
  if (candidates.length === 0) {
    return null;
  }
  return { productUrl, lowestPrice: Math.min(...candidates) };
}
/**
 * Turns a list of product-block HTML strings into parsed product records.
 *
 * Blocks containing "{{nodeid}}" or "sponsorisés" are ignored, as are blocks
 * for which parseProductData yields nothing.
 *
 * @param {Iterable<string>} productBlockMatches - HTML of each product block.
 * @returns {Array<Object>} Records returned by parseProductData, in input order.
 */
function processProductData(productBlockMatches) {
  const isExcluded = (block) =>
    block.includes("{{nodeid}}") || block.includes("sponsorisés");

  return [...productBlockMatches].reduce((collected, block) => {
    if (isExcluded(block)) {
      return collected;
    }
    const parsed = parseProductData(block);
    if (parsed) {
      collected.push(parsed);
    }
    return collected;
  }, []);
}
/**
 * Posts product prices to the controller in batches of at most 500 items.
 *
 * Each batch is sent as JSON of the form
 * `{ products: [{ url, price }, ...] }`. A failed batch is logged and the
 * remaining batches are still attempted; nothing is thrown to the caller.
 *
 * @param {Array<{productUrl: string, lowestPrice: number}>} productData -
 *     Products to send. Nothing is sent for an empty or non-array value.
 * @returns {void}
 */
function sendProductPrices(productData) {
  const BATCH_SIZE = 500;
  const ENDPOINT = 'http://164.132.203.174:3001/products/prices';

  if (!Array.isArray(productData) || productData.length === 0) {
    return;
  }

  for (let i = 0; i < productData.length; i += BATCH_SIZE) {
    const products = productData
      .slice(i, i + BATCH_SIZE)
      .map(({ productUrl, lowestPrice }) => ({ url: productUrl, price: lowestPrice }));

    try {
      const response = UrlFetchApp.fetch(ENDPOINT, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        payload: JSON.stringify({ products }),
        // Keep HTTP error responses as responses so the status code below can
        // actually be inspected and reported.
        muteHttpExceptions: true,
      });
      const statusCode = response.getResponseCode();
      if (statusCode !== 200) {
        // Logger.log treats its first argument as a format string; extra
        // arguments without a %s placeholder are not written, so the value is
        // interpolated into the message itself.
        Logger.log(`Failed to send product prices. Status code: ${statusCode}`);
      }
    } catch (error) {
      Logger.log(`Error sending product prices: ${error}`);
    }
  }
}
/**
 * Fetches one listing page, parses its products and forwards their prices to
 * the controller.
 *
 * The page is requested with a POST to CONFIG.BASE_URL + page. The response
 * is expected to be JSON with a `productsHtml` string and a
 * `totalResultCount` value.
 *
 * Note: prices of the fetched page are sent via sendProductPrices from here,
 * so a caller that later re-fetches the same page and sends its results will
 * send those prices a second time.
 *
 * @param {string} payload - Request body describing the listing.
 * @param {number} [page=1] - Page number appended to the endpoint URL.
 * @returns {{botBlacklisted: boolean}|{productData: Array<Object>, totalResultCount: *}}
 *     `{ botBlacklisted: true }` when the bot-detection marker is found;
 *     otherwise the parsed products and the reported total. When the body
 *     cannot be parsed, `{ productData: [], totalResultCount: 0 }`.
 */
function fetchProductData(payload, page = 1) {
  const response = UrlFetchApp.fetch(CONFIG.BASE_URL + page, {
    method: 'POST',
    headers: CONFIG.POST_HEADER,
    payload: payload,
    followRedirects: false,
    muteHttpExceptions: true,
  });
  // Read the body once; it is needed for both the bot check and the parse.
  const body = response.getContentText();

  if (checkBotDetection(body)) {
    return { botBlacklisted: true };
  }

  try {
    const data = JSON.parse(body);
    const html = data.productsHtml;
    const productBlockMatches = html.match(CONFIG.PRODUCT_BLOCK_REGEX) || [];
    const productData = processProductData(productBlockMatches);
    // Send product prices to the controller
    sendProductPrices(productData);
    return { productData, totalResultCount: data.totalResultCount };
  } catch (error) {
    // Logger.log treats its first argument as a format string, so the error
    // message must be part of that string to be written at all.
    Logger.log(`Error parsing response for payload (page ${page}) ${payload}: ${error.message}`);
    return { productData: [], totalResultCount: 0 };
  }
}
/**
 * Fetches listing pages 1..totalPages and returns every product parsed from
 * them.
 *
 * Pages are requested in groups of up to 50 through UrlFetchApp.fetchAll.
 * A page whose body contains the bot-detection marker, or whose body cannot
 * be parsed, is logged and skipped; the remaining pages are still processed.
 *
 * @param {string} payload - Request body describing the listing.
 * @param {number} totalPages - Number of pages to fetch. No request is made
 *     when this is 0 (or not a number).
 * @returns {Array<Object>} Products from all successfully parsed pages.
 */
function fetchAllProductData(payload, totalPages) {
  const PAGES_PER_BATCH = 50;
  const allProductData = [];

  for (let startPage = 1; startPage <= totalPages; startPage += PAGES_PER_BATCH) {
    const endPage = Math.min(startPage + PAGES_PER_BATCH - 1, totalPages);

    const pagesToFetch = [];
    for (let page = startPage; page <= endPage; page++) {
      pagesToFetch.push({
        url: CONFIG.BASE_URL + page,
        method: 'POST',
        headers: CONFIG.POST_HEADER,
        payload: payload,
        followRedirects: false,
        muteHttpExceptions: true,
      });
    }

    Logger.log(`Fetching pages ${startPage} to ${endPage}`);
    const responses = UrlFetchApp.fetchAll(pagesToFetch);

    for (let i = 0; i < responses.length; i++) {
      // Responses are read in request order, so index i maps to this page.
      const pageNumber = startPage + i;
      const body = responses[i].getContentText();

      if (checkBotDetection(body)) {
        Logger.log(`Bot detected for page ${pageNumber}. Skipping...`);
        continue;
      }

      try {
        const data = JSON.parse(body);
        const html = data.productsHtml;
        const productBlockMatches = html.match(CONFIG.PRODUCT_BLOCK_REGEX) || [];
        const productData = processProductData(productBlockMatches);
        Logger.log(`Found ${productData.length} products on page ${pageNumber}`);
        allProductData.push(...productData);
      } catch (error) {
        // Logger.log treats its first argument as a format string, so the
        // error message must be part of that string to be written at all.
        Logger.log(`Error parsing response for page ${pageNumber}: ${error.message}`);
      }
    }
  }

  Logger.log(`Total products found: ${allProductData.length}`);
  return allProductData;
}
/**
 * Runs one full scrape: checks that the site does not answer with the
 * bot-detection page, asks the controller which categories to process, and
 * for each category fetches all listing pages, sends the collected prices to
 * the controller and marks the category as checked.
 *
 * A category whose first page triggers bot detection is skipped without
 * being marked as checked.
 *
 * @returns {void}
 */
function scrapeCdiscountPages() {
  const CONTROLLER = 'http://164.132.203.174:3001';
  const plainGet = () => ({ method: "GET", muteHttpExceptions: true });

  // Probe the home page first: if it already shows the bot-detection text,
  // nothing else is attempted.
  const homepage = UrlFetchApp.fetch("https://www.cdiscount.com", plainGet());
  const isBlocked = checkBotDetection(homepage.getContentText());
  if (isBlocked) {
    Logger.log("Not the good IP address. Exiting...");
    return;
  }

  const categoriesResponse = UrlFetchApp.fetch(`${CONTROLLER}/categories/to-try-check`, plainGet());
  const categories = JSON.parse(categoriesResponse.getContentText());

  for (const category of categories) {
    const categoryUrl = category.url;
    const categoryPayload = category.payload;
    Logger.log(`Processing category: ${categoryUrl}`);

    // The first page is fetched on its own to learn the total result count.
    const firstPage = fetchProductData(categoryPayload);
    if (firstPage.botBlacklisted) {
      Logger.log(`Bot detected for category ${categoryUrl}. Skipping...`);
      continue;
    }

    // Page count is derived from 50 results per page.
    const pageCount = Math.ceil(firstPage.totalResultCount / 50);
    Logger.log(`Category ${categoryUrl} has ${pageCount} pages`);

    const collected = fetchAllProductData(categoryPayload, pageCount);
    Logger.log(`Found ${collected.length} products for category ${categoryUrl}`);

    // Send all product prices to the controller
    const hasProducts = Boolean(collected) && collected.length > 0;
    if (hasProducts) {
      sendProductPrices(collected);
      Logger.log(`Sent ${collected.length} product prices to the controller`);
    } else {
      Logger.log(`No product data found for category ${categoryUrl}. Skipping sending prices.`);
    }

    // Tell the controller this category has been processed.
    const checkedUrl = `${CONTROLLER}/categories/checked?url=${encodeURIComponent(categoryUrl)}`;
    UrlFetchApp.fetch(checkedUrl, {
      method: "PUT",
      headers: {
        "Content-Type": "application/json"
      },
      payload: JSON.stringify({}),
      muteHttpExceptions: false,
    });
  }
}
/**
 * Decodes HTML character references found in a URL taken from markup.
 *
 * Handles decimal (`&#233;`) and hexadecimal (`&#xE9;`) numeric references
 * as well as the basic named ones (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`). Any other `&name;` sequence is left exactly as it was, rather
 * than being turned into a NUL character.
 *
 * @param {string} url - URL possibly containing character references.
 * @returns {string} The URL with the recognised references decoded.
 */
function removeEncodedAccents(url) {
  const namedEntities = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
  ]);

  return url.replace(/&(#[xX][0-9a-fA-F]+|#\d+|\w+);/g, (match, entity) => {
    if (entity[0] !== '#') {
      return namedEntities.has(entity) ? namedEntities.get(entity) : match;
    }

    const isHex = entity[1] === 'x' || entity[1] === 'X';
    const digits = entity.slice(isHex ? 2 : 1);
    const codePoint = Number.parseInt(digits, isHex ? 16 : 10);
    // fromCodePoint throws above U+10FFFF; leave such references untouched.
    if (Number.isNaN(codePoint) || codePoint > 0x10ffff) {
      return match;
    }
    // fromCodePoint (unlike fromCharCode) keeps characters beyond U+FFFF.
    return String.fromCodePoint(codePoint);
  });
}
// Script entry point: starts the scrape as soon as this file is evaluated.
// NOTE(review): this looks like Google Apps Script (UrlFetchApp / Logger);
// there, top-level statements presumably run on every execution of the
// project, so a trigger that also calls scrapeCdiscountPages would run the
// scrape twice — confirm this top-level call is intended.
scrapeCdiscountPages();