Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trying to write to closed file when using requests.Session #147

Open
maxyousif15 opened this issue Jun 8, 2022 · 0 comments
Open

Trying to write to closed file when using requests.Session #147

maxyousif15 opened this issue Jun 8, 2022 · 0 comments

Comments

@maxyousif15
Copy link

maxyousif15 commented Jun 8, 2022

Overview

When a single requests.Session is reused with capture_http inside a loop that creates a new WARC file on each iteration, a `ValueError: write to closed file` is raised.
However, when calling requests directly, without a session, everything works as expected.

Below is the code snippet that uses requests.Session, followed by the exception it raises:

from warcio.capture_http import capture_http
from requests.sessions import Session
import requests


HEADERS = {
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36',
}


session = Session()
for i in range(3):
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        session.get('https://httpbin.org/ip')

Below is the exception raised:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-4-99e75b92ba45> in <module>
      4     with capture_http(fn):
      5         print(f"Scraping {fn}")
----> 6         session.get('https://httpbin.org/ip')

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in get(self, url, **kwargs)
    540 
    541         kwargs.setdefault('allow_redirects', True)
--> 542         return self.request('GET', url, **kwargs)
    543 
    544     def options(self, url, **kwargs):

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    527         }
    528         send_kwargs.update(settings)
--> 529         resp = self.send(prep, **send_kwargs)
    530 
    531         return resp

~/anaconda3/lib/python3.8/site-packages/requests/sessions.py in send(self, request, **kwargs)
    685 
    686         if not stream:
--> 687             r.content
    688 
    689         return r

~/anaconda3/lib/python3.8/site-packages/requests/models.py in content(self)
    836                 self._content = None
    837             else:
--> 838                 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
    839 
    840         self._content_consumed = True

~/anaconda3/lib/python3.8/site-packages/requests/models.py in generate()
    758             if hasattr(self.raw, 'stream'):
    759                 try:
--> 760                     for chunk in self.raw.stream(chunk_size, decode_content=True):
    761                         yield chunk
    762                 except ProtocolError as e:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in stream(self, amt, decode_content)
    577         else:
    578             while not is_fp_closed(self._fp):
--> 579                 data = self.read(amt=amt, decode_content=decode_content)
    580 
    581                 if data:

~/anaconda3/lib/python3.8/site-packages/urllib3/response.py in read(self, amt, decode_content, cache_content)
    520             else:
    521                 cache_content = False
--> 522                 data = self._fp.read(amt) if not fp_closed else b""
    523                 if (
    524                     amt != 0 and not data

~/anaconda3/lib/python3.8/http/client.py in read(self, amt)
    456             # Amount is given, implement using readinto
    457             b = bytearray(amt)
--> 458             n = self.readinto(b)
    459             return memoryview(b)[:n].tobytes()
    460         else:

~/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
    508             self.length -= n
    509             if not self.length:
--> 510                 self._close_conn()
    511         return n
    512 

~/anaconda3/lib/python3.8/http/client.py in _close_conn(self)
    410         fp = self.fp
    411         self.fp = None
--> 412         fp.close()
    413 
    414     def close(self):

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in close(self)
     63 
     64     def close(self):
---> 65         self.recorder.done()
     66         if self.fp:
     67             return self.fp.close()

~/anaconda3/lib/python3.8/site-packages/warcio/capture_http.py in done(self)
    196 
    197             with self.lock:
--> 198                 self.writer.write_request_response_pair(request, response)
    199         finally:
    200             self.request_out.close()

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write_request_response_pair(self, req, resp, params)
     31             req.rec_headers.add_header('WARC-Concurrent-To', resp_id)
     32 
---> 33         self._do_write_req_resp(req, resp, params)
     34 
     35     def write_record(self, record, params=None):  #pragma: no cover

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _do_write_req_resp(self, req, resp, params)
    138 
    139     def _do_write_req_resp(self, req, resp, params):
--> 140         self._write_warc_record(self.out, resp)
    141         self._write_warc_record(self.out, req)
    142 

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in _write_warc_record(self, out, record)
     89         # write record headers -- encoded as utf-8
     90         # WARC headers can be utf-8 per spec
---> 91         out.write(record.rec_headers.to_bytes(encoding='utf-8'))
     92 
     93         # write headers buffer, if any

~/anaconda3/lib/python3.8/site-packages/warcio/warcwriter.py in write(self, buff)
    120         #    buff = buff.encode('utf-8')
    121         buff = self.compressor.compress(buff)
--> 122         self.out.write(buff)
    123 
    124     def flush(self):

ValueError: write to closed file

The following code snippet does the same thing as above without using a session, and it works as expected:

for i in range(3):
    fn = f'example-session-error-{i}.warc.gz'
    with capture_http(fn):
        print(f"Scraping {fn}")
        requests.get('https://httpbin.org/ip', headers=HEADERS)

Environment

Python - 3.8.5
requests - 2.27.1
warcio - 1.7.4

Any help regarding this issue would be massively appreciated.

@maxyousif15 maxyousif15 changed the title Trying to write to close file when using requests.Sessions Trying to write to closed file when using requests.Session Jun 8, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant