Download large files faster and unzip them
File Download
Downloading files from the internet can be time-consuming work, especially when the files are large. During my project work I came across a situation where I had to download some large files from the internet and unzip them. With the requests library and a plain single-stream download it was taking hours. After a lot of searching I found an article that helped me download files much faster: ask the server for the file size, split it into byte ranges, and fetch the ranges concurrently with asyncio and aiohttp.
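The fast path relies on HTTP range requests, so it is worth verifying that the server actually supports them before splitting the download. Here is a minimal sketch (not part of the downloader below; the URL is a placeholder) that inspects the Accept-Ranges header of a HEAD response:

import requests

def supports_ranges(url: str) -> bool:
    # Servers that allow partial downloads advertise it via Accept-Ranges: bytes.
    head = requests.head(url, allow_redirects=True, timeout=30)
    head.raise_for_status()
    return head.headers.get('Accept-Ranges', 'none').lower() == 'bytes'

if __name__ == '__main__':
    print(supports_ranges('https://example.com/file.txt.gz'))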
File Download Code
import asyncio
import os.path
import random
import shutil
import time
from tempfile import TemporaryDirectory
from urllib.parse import urlparse

import aiofiles
import aiohttp
import requests


# File Download class
class FileDownloader:
    def __init__(self, urls, filename, download_path):
        self.urls = urls
        self.filename = filename
        self.download_path = download_path

    async def get_content_length(self, url):
        # Ask the server for the file size without downloading the body.
        async with aiohttp.ClientSession() as session:
            async with session.head(url) as request:
                return request.content_length

    def parts_generator(self, size, start=0, part_size=50 * 1024 ** 2):
        # Split [0, size) into 50 MiB pieces; the last piece takes the remainder.
        while size - start > part_size:
            yield start, start + part_size
            start += part_size
        yield start, size

    async def download(self, url, headers, save_path, attempt=0):
        max_attempts = 10
        timeout = aiohttp.ClientTimeout(total=6000)
        try:
            async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
                # Jitter the start of each part so all requests do not hit the server at once.
                await asyncio.sleep(1 + random.randint(0, 9))
                async with session.get(url) as request:
                    async with aiofiles.open(save_path, 'wb') as file:
                        await file.write(await request.content.read())
        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
            if attempt < max_attempts:
                print(f'Attempt {attempt} failed, trying again')
                await asyncio.sleep(3 + random.randint(0, 9))
                await self.download(url, headers, save_path, attempt + 1)
            else:
                raise

    async def process(self, url):
        filename = os.path.basename(urlparse(url).path)
        with TemporaryDirectory(prefix=filename, dir=self.download_path) as tmp_dir:
            size = await self.get_content_length(url)
            tasks = []
            file_parts = []
            for number, sizes in enumerate(self.parts_generator(size)):
                part_file_name = os.path.join(tmp_dir, f'{filename}.part{number}')
                file_parts.append(part_file_name)
                # The Range header is inclusive on both ends, hence the -1 on the upper bound.
                tasks.append(self.download(url, {'Range': f'bytes={sizes[0]}-{sizes[1] - 1}'}, part_file_name))
            await asyncio.gather(*tasks)
            download_filename = os.path.join(self.download_path, filename)
            # Delete any existing file before stitching the parts together.
            if os.path.exists(download_filename):
                os.remove(download_filename)
            with open(download_filename, 'wb') as wfd:
                for f in file_parts:
                    with open(f, 'rb') as fd:
                        shutil.copyfileobj(fd, wfd)

    def download_file(self):
        # Fallback: plain single-stream download with requests.
        print('slow download')
        download_file = os.path.join(self.download_path, self.filename)
        with requests.get(self.urls[0], stream=True) as r:
            r.raise_for_status()
            with open(download_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

    async def main(self, fast_download=True):
        if fast_download:
            await asyncio.gather(*[self.process(url) for url in self.urls])
        else:
            self.download_file()


if __name__ == '__main__':
    url = "<file url>"
    start_code = time.monotonic()
    file_downloader = FileDownloader(urls=[url], filename='file.txt.gz', download_path='/tmp')
    asyncio.run(file_downloader.main())
    print(f'{time.monotonic() - start_code} seconds!')
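For servers that do not support range requests, the same entry point can fall back to the plain single-stream download by passing fast_download=False. A minimal usage sketch, with the URL again a placeholder:

import asyncio

# Fall back to the single-stream requests download (placeholder URL).
downloader = FileDownloader(urls=['<file url>'], filename='file.txt.gz', download_path='/tmp')
asyncio.run(downloader.main(fast_download=False))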
File Unzip Code
import gzip
import os
import time

# file unzip class
class FileUnzip:
    def __init__(self, filename, file_path, unzip_path):
        self.filename = filename
        self.file_path = file_path
        self.unzip_path = unzip_path
        self.chunk_size = 100 * 1024 * 1024  # 100 MB

    def unzip(self):
        zip_file = os.path.join(self.file_path, self.filename)
        unzip_file = os.path.join(self.unzip_path, self.filename.replace('.gz', ''))
        # Delete any existing unzipped file before decompressing.
        if os.path.exists(unzip_file):
            os.remove(unzip_file)
        # Decompress in fixed-size chunks so a large archive never has to fit in memory.
        with gzip.open(zip_file, 'rb') as f_in, open(unzip_file, 'wb') as f_out:
            chunk = f_in.read(self.chunk_size)
            while chunk:
                f_out.write(chunk)
                chunk = f_in.read(self.chunk_size)


if __name__ == '__main__':
    start_code = time.monotonic()
    file_unzip = FileUnzip(filename='file.txt.gz', file_path='/tmp', unzip_path='/tmp')
    file_unzip.unzip()
    print(f'{time.monotonic() - start_code} seconds!')
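The explicit read/write loop above can also be written with shutil.copyfileobj, which performs the same chunked copy. A minimal sketch, assuming the same /tmp paths as the example above:

import gzip
import shutil

# Equivalent chunked decompression with shutil.copyfileobj (100 MB chunks).
with gzip.open('/tmp/file.txt.gz', 'rb') as f_in, open('/tmp/file.txt', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out, 100 * 1024 * 1024)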