Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # módulos de Python
- import os
- # módulos de terceros
- import requests
- from bs4 import BeautifulSoup
def datos_contratos(paginas=3):
    """Download the first pages of the portal's recent-tenders listing.

    Issues an initial GET against the procurement portal and then requests
    the following pages by POSTing the listing form back to the server.
    Each response body is handed to ``guardar_y_cargar_html`` under the name
    ``indice_<k>.html`` (``k`` is 0 for the GET, then 1, 3, 5, ... for the
    POSTs).

    Args:
        paginas: Number of "next page" POST requests to issue after the
            initial GET. Defaults to 3, the value that used to be hard-coded.

    Raises:
        RuntimeError: If a response does not contain the listing form (or the
            form has no ``action``), so the next page cannot be requested.
        requests.RequestException: On network errors or timeouts.
    """
    base_url = 'https://contrataciondelestado.es'
    url = base_url + '/wps/portal/!ut/p/b0/04_Sj9CPykssy0xPLMnMz0vMAfGjzOJNXP2dnd08jAwsgo1dDIx8XEJDzTyAXHcj_YJsR0UAIk-WfA!!'
    # Id of the listing <form>; it is also the prefix of the POST field names.
    form_id = "viewns_Z7_BS88AB1A0GSM10A6E365201G25_:liciRecientes"

    # The context manager releases the pooled connections when we are done.
    with requests.Session() as sesion:
        # update() keeps the session's case-insensitive header mapping instead
        # of replacing it with a plain dict.
        # 'Accept-Encoding' is intentionally NOT overridden: advertising
        # "br, zstd" lets the server answer with an encoding the HTTP stack
        # can only decode when optional packages are installed, which would
        # leave req.text as garbage. The library default lists only the
        # encodings it can actually decode.
        sesion.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'contrataciondelestado.es',
            'Pragma': 'no-cache',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Chromium";v="124", "Google Chrome";v="124", "Not-A.Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        })

        # First request (GET).
        print("Petición inicial (GET)")
        req = sesion.get(url, timeout=20)
        print("Content-Location:", req.headers.get("content-location"), end="\n\n")
        guardar_y_cargar_html(req.text, "indice_0.html")

        indice = 1
        # Following pages (POST); each one is derived from the previous response.
        for n in range(paginas):
            soup = BeautifulSoup(req.text, "html.parser")

            form = soup.find(id=form_id)
            action = form.get("action") if form is not None else None
            if not action:
                # Fail with a clear message instead of an AttributeError/TypeError.
                raise RuntimeError(
                    f"No se encontró el formulario '{form_id}' con 'action' "
                    f"(HTTP {req.status_code}); no se puede pedir la página siguiente"
                )
            next_url = base_url + action

            # Prefer the view state the server actually sent in the form's
            # hidden field; fall back to the previously guessed sequence
            # (j_id1:j_id2, j_id3:j_id4, ...) when the field is absent.
            campo = form.find("input", attrs={"name": "javax.faces.ViewState"})
            view_state = campo.get("value") if campo is not None else None
            if not view_state:
                view_state = f"j_id{indice}:j_id{indice+1}"

            data = {
                form_id: form_id,
                # NOTE(review): the "+" is sent literally (form-encoded as %2B).
                # If this value was copied from an already URL-encoded payload,
                # the intended value is presumably "Siguiente >>" - confirm
                # against the form the portal serves.
                f"{form_id}:{form_id}:siguienteLink": "Siguiente+>>",
                "javax.faces.ViewState": view_state,
            }

            print(f'Petición POST #{n+1}: {next_url}')
            req = sesion.post(next_url, data=data, timeout=20)
            print("Content-Location:", req.headers.get("content-location"), end="\n\n")
            guardar_y_cargar_html(req.text, f'indice_{indice}.html')
            indice += 2
def guardar_y_cargar_html(html, nombre_archivo):
    """Save ``html`` to ``nombre_archivo`` and open it in the default browser.

    Args:
        html: Text to write; the file is encoded as UTF-8.
        nombre_archivo: Destination path. An existing file is overwritten.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be written.
    """
    # Imported locally so this function does not depend on edits to the
    # module's import block.
    import webbrowser
    from pathlib import Path

    ruta = Path(nombre_archivo)
    ruta.write_text(html, encoding="utf-8")

    # Running the file name as a shell command only opens the file on Windows
    # and breaks on names containing spaces or shell metacharacters. Handing a
    # file:// URI to the webbrowser module works across platforms and never
    # goes through a shell.
    webbrowser.open(ruta.resolve().as_uri())
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    datos_contratos()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement