If we add RFD to the epoll descriptor with the EPOLLET flag in step 1, the epoll_wait(2) call in step 5 may well hang, even though data is still sitting in the file's input buffer and the sender is waiting for a reply to the data it has already sent. The reason is that edge-triggered (ET) mode reports an event only when a change occurs on the monitored file descriptor, so in step 5 the caller may end up waiting for data that is already present in the input buffer. In the example above, an event is generated on RFD by the write in step 2, and that event is consumed in step 3. Because the read in step 4 does not drain the input buffer, whether the epoll_wait(2) call in step 5 blocks is indeterminate. An application using epoll in ET mode must therefore use non-blocking file descriptors, so that a blocking read or write on one descriptor cannot starve the task that is servicing many descriptors. The recommended way to drive an ET-mode epoll interface is shown below; how to avoid the possible pitfalls is covered afterwards.
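To make the scenario concrete, here is a minimal, self-contained sketch of the five steps. The buffer sizes, the pipe, and the -1 timeout are illustrative assumptions, and error handling is trimmed; it is a demonstration of the hazard, not production code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

int main (void)
{
    int pfd[2];
    struct epoll_event ev, out;
    char data[2048], buf[1024];

    if (pipe (pfd) == -1) { perror ("pipe"); exit (1); }

    int efd = epoll_create1 (0);
    ev.events = EPOLLIN | EPOLLET;      /* step 1: add RFD (the read end) edge-triggered */
    ev.data.fd = pfd[0];
    epoll_ctl (efd, EPOLL_CTL_ADD, pfd[0], &ev);

    memset (data, 'x', sizeof data);
    write (pfd[1], data, sizeof data);  /* step 2: the writer puts 2 KB into the pipe */

    epoll_wait (efd, &out, 1, -1);      /* step 3: RFD is reported ready once */
    read (pfd[0], buf, sizeof buf);     /* step 4: consume only 1 KB of it */

    /* step 5: 1 KB is still buffered, but no new edge has occurred,
       so with a -1 timeout this call may block indefinitely. */
    int n = epoll_wait (efd, &out, 1, -1);
    printf ("epoll_wait returned %d\n", n);
    return 0;
}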
ET (edge-triggered) is the high-performance mode: it supports only non-blocking sockets and is more efficient than LT. The difference between the two is this: when a new event arrives, ET mode does deliver it from epoll_wait, but if the socket buffer belonging to that event is not fully drained during that handling, then as long as no new event arrives on the socket, ET mode will never deliver the event from epoll_wait again. LT mode is exactly the opposite: as long as the socket buffer associated with an event still holds data, epoll_wait keeps reporting the event. Consequently, epoll-based applications are simpler to develop and less error-prone in LT mode, whereas in ET mode, failing to drain the buffer completely when an event fires means the user requests left in the buffer never get a response.
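The example below relies on a make_socket_non_blocking helper that is not shown in this excerpt. A plausible implementation using fcntl(2) might look like the following; this is an assumption about the helper, not the original author's code.

#include <fcntl.h>
#include <stdio.h>

static int
make_socket_non_blocking (int sfd)
{
  /* Fetch the current file status flags. */
  int flags = fcntl (sfd, F_GETFL, 0);
  if (flags == -1)
    {
      perror ("fcntl");
      return -1;
    }

  /* Add O_NONBLOCK so read/write return EAGAIN instead of blocking. */
  if (fcntl (sfd, F_SETFL, flags | O_NONBLOCK) == -1)
    {
      perror ("fcntl");
      return -1;
    }

  return 0;
}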
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /* Protect the access to this structure */
        spinlock_t lock;

        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        struct mutex mtx;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors: a doubly linked list holding the
           ready events that will be returned to the user by epoll_wait */
        struct list_head rdllist;

        /* RB tree root used to store monitored fd structs: the root of the
           red-black tree that stores every event added to this epoll
           instance, i.e. everything this epoll monitors */
        struct rb_root rbr;

        /*
         * This is a single linked list that chains all the "struct epitem" that
         * happened while transferring ready events to userspace w/out
         * holding ->lock.
         */
        struct epitem *ovflist;

        /* wakeup_source used when ep_scan_ready_list is running */
        struct wakeup_source *ws;

        /* The user that created the eventpoll descriptor */
        struct user_struct *user;

        struct file *file;

        /* used to optimize loop detection check */
        int visited;
        struct list_head visited_list_link;
};
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
        /* RB tree node used to link this structure to the eventpoll RB tree */
        struct rb_node rbn;

        /* List header used to link this structure to the eventpoll ready list */
        struct list_head rdllink;

        /*
         * Works together "struct eventpoll"->ovflist in keeping the
         * single linked chain of items.
         */
        struct epitem *next;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /* Number of active wait queue attached to poll operations */
        int nwait;

        /* List containing poll wait queues */
        struct list_head pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* List header used to link this item to the "struct file" items list */
        struct list_head fllink;

        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;

        /* The structure that describe the interested events and the source fd */
        struct epoll_event event;
};
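For orientation, here is a sketch of how the userspace API maps onto these two structures: epoll_create1 allocates one struct eventpoll, and every EPOLL_CTL_ADD allocates a struct epitem keyed by the (file, fd) pair and inserts it into ep->rbr; when the descriptor becomes ready, the epitem is linked onto ep->rdllist, which is what epoll_wait drains. The watch_fd helper name is hypothetical, used only to frame the calls.

#include <sys/epoll.h>

/* Registers 'sock' with a fresh epoll instance.  epoll_create1 allocates
   one struct eventpoll; EPOLL_CTL_ADD allocates a struct epitem for the
   (file, fd) pair and inserts it into the red-black tree ep->rbr. */
int watch_fd (int sock)
{
    int efd = epoll_create1 (0);
    if (efd == -1)
        return -1;

    struct epoll_event ev;
    ev.events = EPOLLIN;    /* copied into epitem->event */
    ev.data.fd = sock;

    if (epoll_ctl (efd, EPOLL_CTL_ADD, sock, &ev) == -1)
        return -1;

    return efd;
}

The excerpt below, from the classic edge-triggered example, does the same registration for the listening socket sfd and then runs the event loop.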
  event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET;   /* read events, edge-triggered */
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);
  if (s == -1)
    {
      perror ("epoll_ctl");
      abort ();
    }

  /* Buffer where events are returned */
  events = calloc (MAXEVENTS, sizeof event);

  /* The event loop */
  while (1)
    {
      int n, i;

      n = epoll_wait (efd, events, MAXEVENTS, -1);
      for (i = 0; i < n; i++)
        {
          if ((events[i].events & EPOLLERR) ||
              (events[i].events & EPOLLHUP) ||
              (!(events[i].events & EPOLLIN)))
            {
              /* An error has occurred on this fd, or the socket is not
                 ready for reading (why were we notified then?) */
              fprintf (stderr, "epoll error\n");
              close (events[i].data.fd);
              continue;
            }
          else if (sfd == events[i].data.fd)
            {
              /* We have a notification on the listening socket, which
                 means one or more incoming connections. */
              while (1)
                {
                  struct sockaddr in_addr;
                  socklen_t in_len;
                  int infd;
                  char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];

                  in_len = sizeof in_addr;
                  infd = accept (sfd, &in_addr, &in_len);
                  if (infd == -1)
                    {
                      if ((errno == EAGAIN) ||
                          (errno == EWOULDBLOCK))
                        {
                          /* We have processed all incoming
                             connections. */
                          break;
                        }
                      else
                        {
                          perror ("accept");
                          break;
                        }
                    }

                  /* Resolve the peer's numeric host and port for logging. */
                  s = getnameinfo (&in_addr, in_len,
                                   hbuf, sizeof hbuf,
                                   sbuf, sizeof sbuf,
                                   NI_NUMERICHOST | NI_NUMERICSERV);
                  if (s == 0)
                    {
                      printf ("Accepted connection on descriptor %d "
                              "(host=%s, port=%s)\n", infd, hbuf, sbuf);
                    }

                  /* Make the incoming socket non-blocking and add it
                     to the list of fds to monitor. */
                  s = make_socket_non_blocking (infd);
                  if (s == -1)
                    abort ();

                  event.data.fd = infd;
                  event.events = EPOLLIN | EPOLLET;
                  s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event);
                  if (s == -1)
                    {
                      perror ("epoll_ctl");
                      abort ();
                    }
                }
              continue;
            }
          else
            {
              /* We have data on the fd waiting to be read. Read and
                 display it. We must read whatever data is available
                 completely, as we are running in edge-triggered mode
                 and won't get a notification again for the same
                 data. */
              int done = 0;

              while (1)
                {
                  ssize_t count;
                  char buf[512];

                  count = read (events[i].data.fd, buf, sizeof buf);
                  if (count == -1)
                    {
                      /* If errno == EAGAIN, that means we have read
                         all data. So go back to the main loop. */
                      if (errno != EAGAIN)
                        {
                          perror ("read");
                          done = 1;
                        }
                      break;
                    }
                  else if (count == 0)
                    {
                      /* End of file. The remote has closed the
                         connection. */
                      done = 1;
                      break;
                    }

                  /* Write the buffer to standard output */
                  s = write (1, buf, count);
                  if (s == -1)
                    {
                      perror ("write");
                      abort ();
                    }
                }

              if (done)
                {
                  printf ("Closed connection on descriptor %d\n",
                          events[i].data.fd);

                  /* Closing the descriptor will make epoll remove it
                     from the set of descriptors which are monitored. */
                  close (events[i].data.fd);
                }
            }
        }
    }
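One caveat about the closing comment above: according to epoll(7), close(2) removes a descriptor from the epoll set only when it closes the last file descriptor referring to the underlying open file description. A duplicate created with dup(2) (or inherited across fork(2)) keeps the registration alive. A sketch of the situation, with close_caveat as a hypothetical illustration:

#include <unistd.h>
#include <sys/epoll.h>

/* Assumes 'fd' was previously registered on 'epfd' with epoll_ctl. */
void close_caveat (int epfd, int fd)
{
    int dupfd = dup (fd);  /* second fd sharing the same open file description */

    close (fd);            /* not enough: dupfd keeps the epoll registration alive */

    /* Either close every duplicate ... */
    close (dupfd);
    /* ... or remove the registration explicitly before closing:
       epoll_ctl (epfd, EPOLL_CTL_DEL, fd, NULL);  */
}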