select+epoll+socket 简单源码


非阻塞 IO 实现 HTTP 请求

import socket, time 

from urllib.parse import urlparse 

from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE 


# select + callbacks + event loop:
# - high concurrency
# - single-threaded, small memory footprint
# DefaultSelector is an alias for the most efficient mechanism available on
# the current platform (epoll on Linux; on Windows only select() exists).
selector = DefaultSelector()

# Set to True by the `readable` callback once `urls` is empty; ends loop().
stop = False
# URLs still being fetched; `readable` removes an entry when its response
# has been fully received.
urls = []

使用select完成http请求

class Fetcher:
    """Fetch one URL over plain HTTP with a non-blocking socket and callbacks.

    Life cycle, driven by the module-level ``selector``:

    1. ``get_url`` starts the connect and registers ``connected`` for
       ``EVENT_WRITE``.
    2. ``connected`` sends the GET request and registers ``readable`` for
       ``EVENT_READ``.
    3. ``readable`` accumulates the response until the peer closes, prints the
       body, removes the URL from ``urls`` and sets ``stop`` when none remain.
    """

    def connected(self, key):
        """Send the HTTP request once the socket is reported writable.

        Args:
            key: SelectorKey handed over by the event loop; ``key.fd`` is the
                file descriptor that was registered in ``get_url``.
        """
        selector.unregister(key.fd)
        request = "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(self.path, self.host)
        self.client.send(request.encode("utf8"))
        # From now on we only care about the response arriving.
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        """Read one chunk of the response; finish up when the peer has closed.

        Args:
            key: SelectorKey handed over by the event loop.

        Raises:
            UnicodeDecodeError: If the complete response is not valid UTF-8.
            ValueError: If this fetcher's URL is not present in ``urls``.
        """
        global stop

        chunk = self.client.recv(1024)
        if chunk:
            self.data += chunk
            return

        # An empty read means the server closed the connection (the request
        # asked for Connection:close), so the response is complete.
        selector.unregister(key.fd)
        # Release the socket before parsing so it cannot leak if decoding fails.
        self.client.close()
        print(self._extract_body(self.data))

        urls.remove(self.spider_url)
        if not urls:
            stop = True

    def get_url(self, url):
        """Begin a non-blocking HTTP GET for ``url``.

        Args:
            url: Absolute ``http://`` URL. ``readable`` later removes it from
                the module-level ``urls`` list, so the caller must have
                appended it there.
        """
        self.spider_url = url
        parts = urlparse(url)
        self.host = parts.netloc
        self.path = parts.path or "/"
        if parts.query:
            # The query string is part of the request target.
            self.path = "{}?{}".format(self.path, parts.query)
        self.data = b""

        # netloc may carry an explicit ":port"; connect to the bare hostname
        # and honour that port, defaulting to 80.
        address = (parts.hostname or self.host, parts.port or 80)

        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        try:
            self.client.connect(address)
        except BlockingIOError:
            # Expected on a non-blocking socket: the connect is still in
            # progress; the selector reports the socket writable afterwards.
            pass

        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)

    @staticmethod
    def _extract_body(raw):
        """Return the text following the first blank line of a raw response.

        Args:
            raw: Complete response bytes (status line, headers and body).

        Returns:
            Everything after the first ``\\r\\n\\r\\n``, or ``""`` when the
            response contains no header terminator.
        """
        _headers, _sep, body = raw.decode("utf8").partition("\r\n\r\n")
        return body

def loop():
    """Run the event loop until the module-level ``stop`` flag becomes true.

    Each pass asks the selector which registered sockets are ready and invokes
    the callback that was stored as the registration's ``data``, passing it
    the SelectorKey.
    """
    # NOTE: raw select() has no register-style API, and nothing calls back
    # automatically when a socket changes state -- dispatching to the stored
    # callback is this program's job, done right here.
    while not stop:
        for key, _events in selector.select():
            handler = key.data
            handler(key)
# Callback + event loop + select (poll/epoll)


if __name__ == "__main__":
    # Issue 50 concurrent requests for the same page and time the whole run.
    start_time = time.time()
    url = "http://www.baidu.com"
    for _ in range(50):
        urls.append(url)
        # The selector registration keeps each Fetcher alive via its callback.
        Fetcher().get_url(url)
    loop()
    print(time.time() - start_time)