# Source: pyeph-data/getter.py at master · reflejar/pyeph-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import re
import os
import requests
import zipfile
import shutil
import pandas as pd

# TODO: Automatic download of the baskets ("canastas") is still missing
# https://datos.gob.ar/fi/dataset/sspm-canasta-basica-alimentaria-regiones-pais
# https://datos.gob.ar/vi/dataset/sspm-canasta-basica-total-regiones-pais

# Regular expression used to validate the URL given on the command line.
# NOTE(review): the pattern has no end anchor, so with `match()` only a prefix
# of the string has to conform — confirm that this is intended.
URL_REGEX = re.compile(
    r'^(https?://)'                 # http:// or https:// (required)
    r'(www\.)?'                     # www. (optional)
    r'([\w.-]*indec[\w.-]*)'        # Domain containing "indec"
    r'(\.[a-z]{2,})'                # Top-level domain (.com, .ar, etc.)
    r'(/[\w./-]*)*',                 # Paths and sub-paths (optional)
    re.IGNORECASE
)

def descargar_y_procesar(url):
    """Download a ZIP of ``;``-separated TXT tables and repackage them as zipped CSVs.

    The archive name is the last path component of whatever follows ``/eph/``
    in ``url``.  Its third and fifth underscore-separated fields are used as
    the quarter and the year (e.g. ``EPH_usu_3_Trim_2023_txt.zip`` gives
    quarter ``3`` and year ``2023``).

    Every ``.txt`` file found in the first extracted folder (or directly in
    ``tmp`` when the archive has no folder) is read with pandas, written as
    ``base_<hogar|individual>_<year>T<quarter>.csv`` and compressed into
    ``<hogar|individual>/base_<hogar|individual>_<year>T<quarter>.zip``.
    The working directory ``./tmp`` is removed afterwards, whether or not the
    processing succeeded.

    Args:
        url: Address of the ZIP archive; must contain ``/eph/`` followed by
            the archive name.

    Returns:
        None. A non-200 HTTP status is reported on stdout and the function
        returns without producing any output file.

    Raises:
        ValueError: If ``url`` has no archive name after ``/eph/`` or the
            name does not have at least five ``_``-separated fields.
        Exceptions raised by ``requests``, ``zipfile`` or ``pandas`` while
        downloading, extracting or parsing propagate unchanged.
    """
    # Keep only the final path component: extra sub-directories or ".."
    # segments in the URL must never influence where the download is written.
    _, marker, remainder = url.partition("/eph/")
    archive_name = os.path.basename(remainder)
    if not marker or not archive_name:
        raise ValueError(
            f"La URL no contiene un nombre de archivo luego de '/eph/': {url}"
        )

    name_fields = archive_name.split("_")
    if len(name_fields) < 5:
        raise ValueError(
            f"Nombre de archivo inesperado, no se puede obtener trimestre y año: {archive_name}"
        )
    trimestre = name_fields[2]
    anio = name_fields[4]

    zip_filename = f'./tmp/{archive_name}'

    # Create the ./tmp working directory if it does not exist
    os.makedirs("./tmp", exist_ok=True)

    try:
        # Download the ZIP archive. The timeout keeps an unresponsive server
        # from blocking the script forever.
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            print(f"Error al descargar el archivo: {response.status_code}")
            return

        with open(zip_filename, "wb") as file:
            file.write(response.content)
        print(f"Archivo descargado: {zip_filename}")

        # Extract the ZIP archive
        with zipfile.ZipFile(zip_filename, "r") as zip_ref:
            zip_ref.extractall('tmp')
        print("Archivos extraídos en la carpeta 'tmp'")

        # Use the first extracted folder; when the archive has no folder the
        # TXT files sit directly inside tmp.
        folders = [
            entry for entry in os.listdir("tmp")
            if os.path.isdir(os.path.join('tmp', entry))
        ]
        folder = os.path.join('tmp', folders[0]) if folders else 'tmp'

        for txt_file in os.listdir(folder):
            lowered_name = txt_file.lower()
            if not lowered_name.endswith(".txt"):
                continue

            txt_path = os.path.join(folder, txt_file)
            if not os.path.isfile(txt_path):
                print(f"Archivo {txt_file} no encontrado en el ZIP.")
                continue

            # Load the TXT table into a DataFrame
            df = pd.read_csv(txt_path, sep=";", low_memory=False)

            # Build the output names; the match is case-insensitive so that a
            # "Hogar" file is not mislabelled as "individual".
            base_name = "hogar" if "hogar" in lowered_name else "individual"
            output_stem = f"base_{base_name}_{anio}T{trimestre}"
            csv_output = f"{output_stem}.csv"
            zip_output = f"{output_stem}.zip"
            csv_path = f'./tmp/{csv_output}'

            # Force the ITF column to int: values that cannot be parsed as
            # numbers become 0.
            if 'ITF' in df.columns:
                df['ITF'] = pd.to_numeric(df['ITF'], errors='coerce').fillna(0).astype(int)
            df.to_csv(csv_path, index=False)
            print(f"Archivo CSV generado: {csv_output}")

            # Compress the CSV into a ZIP, creating the destination folder
            # first so a fresh checkout does not fail here.
            os.makedirs(base_name, exist_ok=True)
            with zipfile.ZipFile(f'{base_name}/{zip_output}', "w", zipfile.ZIP_DEFLATED) as zip_out:
                zip_out.write(csv_path, arcname=csv_output)
            print(f"Archivo ZIP generado: {zip_output}")
    finally:
        # Always clean the temporary folder, even after a failed download or
        # an exception during processing. Cleanup stays best-effort.
        try:
            shutil.rmtree("./tmp")
        except OSError:
            print("No se pudo eliminar carpeta tmp")


# Command-line entry: the first argument is the URI of the archive to process.
if len(sys.argv) < 2:
    print("No se proporcionó URI.")
else:
    URI = sys.argv[1]
    if not URL_REGEX.match(URI):
        print(f"La URI proporcionada no es válida: {URI}")
    else:
        descargar_y_procesar(URI)
        print("Procesado con éxito")

        # Regenerate index.html; any failure here is reported as a warning only.
        try:
            import generate_index
            generate_index.generate_index()
        except Exception as exc:
            print(f"Advertencia: No se pudo actualizar index.html: {exc}")