"""
1688 product scraper: fetches product pages (optionally through rotating
proxies), translates the title and description to French, and posts the
result to a Laravel import endpoint.
"""
import os
import random
import re
import sys
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from deep_translator import GoogleTranslator

# Configuration
LARAVEL_API_URL = os.getenv('LARAVEL_API_URL', 'http://your-laravel-app.com/api/scraper/import-product')
SCRAPER_TOKEN = os.getenv('SCRAPER_TOKEN', 'your-secret-token')
# Proxy configuration
PROXY_PROVIDER = os.getenv('PROXY_PROVIDER', '')  # e.g., 'luminati', 'proxycrawl'
PROXY_API_KEY = os.getenv('PROXY_API_KEY', '')
PROXY_USERNAME = os.getenv('PROXY_USERNAME', '')
PROXY_PASSWORD = os.getenv('PROXY_PASSWORD', '')
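# Illustrative shell configuration (all values are placeholders):
#   export LARAVEL_API_URL="https://your-laravel-app.com/api/scraper/import-product"
#   export SCRAPER_TOKEN="change-me"
#   export PROXY_PROVIDER="proxycrawl"
#   export PROXY_API_KEY="your-api-key"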

class ProxyManager:
    """
    Manages proxy rotation with support for professional proxy services.
    Supports both static proxy lists and dynamic proxy APIs.
    """
    def __init__(self):
        self.proxies = [
            # 'http://user:pass@ip:port',
            # Add your static proxies here if not using a proxy service
        ]
        self.current_index = 0
        self.proxy_provider = PROXY_PROVIDER
        self.api_key = PROXY_API_KEY
        self.username = PROXY_USERNAME
        self.password = PROXY_PASSWORD

    def get_proxy(self):
        """Get a proxy based on configuration"""
        # If using a professional proxy service
        if self.proxy_provider and self.api_key:
            return self._get_dynamic_proxy()
        
        # Fallback to static proxy list
        if self.proxies:
            proxy = self.proxies[self.current_index]
            self.current_index = (self.current_index + 1) % len(self.proxies)
            return {"http": proxy, "https": proxy}
        
        # No proxy (direct connection)
        return None
    
    def _get_dynamic_proxy(self):
        """Get proxy from professional proxy service"""
        try:
            if self.proxy_provider.lower() == 'luminati':
                # Luminati proxy format
                proxy_url = f"http://{self.username}:{self.password}@zproxy.lum-superproxy.io:22225"
                return {"http": proxy_url, "https": proxy_url}
            elif self.proxy_provider.lower() == 'proxycrawl':
                # ProxyCrawl requests are routed through its API endpoint in
                # fetch_page (the token is passed as a query parameter), so
                # no proxy dict is needed here.
                return None
            else:
                # Generic proxy service
                return None
        except Exception as e:
            print(f"Proxy service error: {e}")
            return None
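
# Minimal usage sketch for ProxyManager (the endpoints below are made-up
# placeholders): seed the static list, then get_proxy() round-robins it.
#   pm = ProxyManager()
#   pm.proxies = ['http://user:pass@203.0.113.10:8080',
#                 'http://user:pass@203.0.113.11:8080']
#   pm.get_proxy()  # {'http': 'http://user:pass@203.0.113.10:8080', ...}
#   pm.get_proxy()  # the next call rotates to ...113.11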

class ProductScraper:
    def __init__(self):
        self.ua = UserAgent()
        self.proxy_manager = ProxyManager()
        # Initialize the translator (auto-detect source -> French)
        self.translator = GoogleTranslator(source='auto', target='fr')

    def fetch_page(self, url):
        """
        Fetches the page content with retries and proxy rotation.
        """
        headers = {
            'User-Agent': self.ua.random,
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.1688.com/'
        }

        max_retries = 3
        for attempt in range(max_retries):
            try:
                proxy = self.proxy_manager.get_proxy()
                
                # Handle ProxyCrawl specifically: the target URL must be
                # percent-encoded before it goes into the query string.
                if PROXY_PROVIDER.lower() == 'proxycrawl' and PROXY_API_KEY:
                    proxycrawl_url = f"http://api.proxycrawl.com/?token={PROXY_API_KEY}&url={quote_plus(url)}"
                    response = requests.get(proxycrawl_url, headers=headers, timeout=30)
                else:
                    response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
                
                if response.status_code == 200:
                    return response.text
                elif response.status_code == 404:
                    print(f"Page not found: {url}")
                    return None
                else:
                    print(f"Status {response.status_code} on attempt {attempt+1}")
            
            except requests.RequestException as e:
                print(f"Network error on attempt {attempt+1}: {e}")
            
            # Random sleep before retry
            time.sleep(random.uniform(2, 5))
        
        return None
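
    # fetch_page retries up to three times with a random 2-5 s pause between
    # attempts. If the target throttles aggressively, an exponential backoff
    # is a drop-in change (sketch): time.sleep((2 ** attempt) + random.random())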

    def safe_translate(self, text):
        try:
            if not text:
                return ""
            return self.translator.translate(text)
        except Exception as e:
            print(f"Translation failed: {e}")
            return text # Return original on failure
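
    # Note: deep_translator's GoogleTranslator rejects very long inputs
    # (on the order of 5,000 characters), which is why parse_product only
    # translates the first 500 characters of the description. A chunked
    # variant could look like this (sketch, hypothetical helper):
    #   def translate_long(self, text, chunk=4500):
    #       parts = [text[i:i + chunk] for i in range(0, len(text), chunk)]
    #       return " ".join(self.safe_translate(p) for p in parts)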

    def parse_product(self, html, original_url):
        """
        Parses 1688 product page HTML.
        Note: the 1688 DOM changes frequently, so this is generic example
        logic; inspect the live page to confirm the exact selectors.
        """
        if not html:
            return None

        soup = BeautifulSoup(html, 'html.parser')
        
        data = {
            'external_link': original_url,
            'image_urls': []
        }

        try:
            # 1. Title
            title_tag = soup.find('h1', class_='title-text') or soup.find('div', class_='title-text')
            raw_title = title_tag.get_text(strip=True) if title_tag else "Unknown Product"
            data['name_fr'] = self.safe_translate(raw_title)

            # 2. Price
            # 1688 prices are often ranges (e.g. "1.50-3.00"); stripping all
            # non-numeric characters would fuse the bounds into one invalid
            # number, so grab the first number that appears instead.
            price_tag = soup.find('span', class_='price-text') or soup.find('div', class_='price-text')
            if price_tag:
                price_match = re.search(r'\d+(?:\.\d+)?', price_tag.get_text())
                data['price'] = float(price_match.group()) if price_match else 0.0
            else:
                data['price'] = 0.0

            # 3. Code/SKU
            # Usually encoded in the URL, e.g.
            # https://detail.1688.com/offer/123456789.html -> SKU 123456789
            match = re.search(r'offer/(\d+)\.html', original_url)
            if match:
                data['sku'] = match.group(1)
            else:
                data['sku'] = f"1688-{int(time.time())}"  # Fallback
            
            # Map source_link
            data['source_link'] = original_url

            # 4. Details / Description
            # Usually a generic description container
            desc_tag = soup.find('div', id='desc-lazy-load-container')
            # Translate the first 500 characters of plain text (keep the HTML
            # instead if you want rich text, but sanitize it first)
            data['description_fr'] = self.safe_translate(desc_tag.get_text(strip=True)[:500]) if desc_tag else ""

            # 5. Stock Information
            # Look for stock information on the page
            stock_tag = soup.find('span', class_='amount') or soup.find('em', class_='amount')
            if stock_tag:
                stock_text = stock_tag.get_text(strip=True)
                # Extract numeric value from stock text
                stock_numbers = re.findall(r'\d+', stock_text)
                if stock_numbers:
                    data['current_stock'] = int(stock_numbers[0])
                else:
                    data['current_stock'] = 0
            else:
                # Alternative selectors for stock
                stock_input = soup.find('input', {'id': 'productAmount'})
                if stock_input and stock_input.get('value'):
                    try:
                        data['current_stock'] = int(stock_input['value'])
                    except ValueError:
                        data['current_stock'] = 0
                else:
                    data['current_stock'] = 0

            # 6. Images
            # Main gallery images
            image_tags = soup.select('.tab-content-container li .box-img img') # Selection logic varies
            for img in image_tags:
                src = img.get('src')
                if src:
                    data['image_urls'].append(src)

            # Fallback for simpler DOMs
            if not data['image_urls']:
                og_image = soup.find('meta', property='og:image')
                if og_image and og_image.get('content'):
                    data['image_urls'].append(og_image['content'])

        except Exception as e:
            print(f"Parsing error: {e}")
            return None

        return data

    def send_to_laravel(self, product_data):
        headers = {
            'Content-Type': 'application/json',
            'X-SCRAPER-TOKEN': SCRAPER_TOKEN
        }
        
        try:
            response = requests.post(LARAVEL_API_URL, json=product_data, headers=headers, timeout=10)
            if response.status_code in (200, 201):
                print(f"Successfully imported: {product_data.get('sku')}")
                return True
            else:
                print(f"Failed to import: {response.text}")
                return False
        except Exception as e:
            print(f"API Error: {e}")
            return False
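
    # For reference, the JSON body posted above carries the fields built by
    # parse_product (values here are illustrative):
    #   {
    #     "sku": "123456789",
    #     "name_fr": "...",
    #     "description_fr": "...",
    #     "price": 12.5,
    #     "current_stock": 100,
    #     "external_link": "https://detail.1688.com/offer/123456789.html",
    #     "source_link": "https://detail.1688.com/offer/123456789.html",
    #     "image_urls": ["https://img.example.com/1.jpg"]
    #   }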

def run_scraper(url_list):
    scraper = ProductScraper()
    
    for url in url_list:
        print(f"Processing: {url}")
        html = scraper.fetch_page(url)
        if html:
            data = scraper.parse_product(html, url)
            if data:
                print(f"Parsed: {data.get('name_fr', 'Unknown')} - Price: {data.get('price', 0)} - Stock: {data.get('current_stock', 0)}")
                scraper.send_to_laravel(data)
            else:
                print("Failed to parse data.")
        else:
            print("Failed to fetch page.")
        
        # Respectful delay
        time.sleep(random.uniform(5, 10))

if __name__ == "__main__":
    # Example usage: pass URLs as CLI args, or fall back to a hardcoded list.
    # Usage: python main.py https://detail.1688.com/offer/xxxx.html ...
    
    if len(sys.argv) > 1:
        target_urls = sys.argv[1:]
    else:
        # Default test list if no args
        print("No URLs provided. Using test URL.")
        target_urls = [
            "https://detail.1688.com/offer/example.html" 
            # Replace with real test URL
        ]
        
    run_scraper(target_urls)