// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/third_party/quiche/src/epoll_server/simple_epoll_server.h"

#include <errno.h>   // for errno and strerror_r
#include <stdlib.h>  // for abort
#include <unistd.h>  // For read, pipe, close and write.

#include <algorithm>
#include <utility>

#include "net/third_party/quiche/src/epoll_server/platform/api/epoll_bug.h"
#include "net/third_party/quiche/src/epoll_server/platform/api/epoll_time.h"

// Design notes: An efficient implementation of the ready list has the
// following desirable properties:
//
// A. O(1) insertion into/removal from the list in any location.
// B. Once the callback is found by hash lookup using the fd, the lookup of
//    the corresponding entry in the list is O(1).
// C. Safe insertion into/removal from the list during list iteration. (The
//    ready list's purpose is to enable a completely event-driven I/O model.
//    Thus, all the interesting bits happen in the callback. It is critical
//    to not place any restriction on the API during list iteration.)
//
// The current implementation achieves these goals with the following design:
//
// - The ready list is constructed as a doubly linked list to enable O(1)
//   insertion/removal (see man 3 queue).
// - The forward and backward links are directly embedded inside the
//   CBAndEventMask struct. This enables O(1) lookup in the list for a given
//   callback. (Technically, we could've used a std::list of
//   hash_set::iterator, and kept a list::iterator in CBAndEventMask to
//   achieve the same effect. However, iterators have two problems: no way to
//   portably invalidate them, and no way to tell whether an iterator is
//   singular or not. The only way to overcome these issues is to keep bools
//   in both places, but that throws off memory alignment (up to 7 wasted
//   bytes for each bool). The extra level of indirection will also likely be
//   less cache friendly. Direct manipulation of link pointers makes it easier
//   to retrieve the CBAndEventMask from the list, easier to check whether a
//   CBAndEventMask is in the list, uses less memory (saving 32 bytes/fd), and
//   does not affect cache usage (we need to read in the struct to use the
//   callback anyway).)
// - Embed the fd directly into CBAndEventMask and switch to using hash_set.
//   This removes the need to store hash_map::iterator in the list just so that
//   we can get both the fd and the callback.
// - The ready list is "one shot": each entry is removed before OnEvent is
//   called. This removes the mutation-while-iterating problem.
// - Use two lists to keep track of callbacks. The ready_list_ is the one used
//   for registration. Before iteration, the ready_list_ is swapped into the
//   tmp_list_. Once iteration is done, tmp_list_ will be empty, and
//   ready_list_ will have all the new ready fds.

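// Illustrative sketch (comment only; EchoCallback is hypothetical and not part
// of this file): a callback that relies on the one-shot ready list described
// above. After OnEvent() returns, the entry has already been removed from the
// ready list; setting event->out_ready_mask asks the server to fake those
// events and re-queue the entry (see CallReadyListCallbacks() below).
//
//   class EchoCallback : public EpollCallbackInterface {
//    public:
//     void OnEvent(int fd, EpollEvent* event) override {
//       if (event->in_events & EPOLLIN) {
//         char buf[512];
//         while (read(fd, buf, sizeof(buf)) > 0) {
//           // ... consume data ...
//         }
//       }
//       // Ask to be called again with a faked EPOLLOUT on the next pass.
//       event->out_ready_mask = EPOLLOUT;
//     }
//     // Remaining EpollCallbackInterface methods omitted for brevity.
//   };
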
// The size we use for buffers passed to strerror_r
static const int kErrorBufferSize = 256;

namespace epoll_server {

template <typename T>
class AutoReset {
 public:
  AutoReset(T* scoped_variable, T new_value)
      : scoped_variable_(scoped_variable),
        original_value_(std::move(*scoped_variable)) {
    *scoped_variable_ = std::move(new_value);
  }
  AutoReset(const AutoReset&) = delete;
  AutoReset& operator=(const AutoReset&) = delete;

  ~AutoReset() { *scoped_variable_ = std::move(original_value_); }

 private:
  T* scoped_variable_;
  T original_value_;
};
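
// AutoReset saves the current value of *scoped_variable, installs new_value,
// and restores the saved value when the guard goes out of scope. A minimal
// usage sketch (flag is hypothetical):
//
//   bool flag = false;
//   {
//     AutoReset<bool> guard(&flag, true);
//     // flag is true here.
//   }
//   // flag is false again here.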

// Clears the pipe and returns. Used for waking the epoll server up.
class ReadPipeCallback : public EpollCallbackInterface {
 public:
  void OnEvent(int fd, EpollEvent* event) override {
    DCHECK(event->in_events == EPOLLIN);
    int data;
    int data_read = 1;
    // Read until the pipe is empty.
    while (data_read > 0) {
      data_read = read(fd, &data, sizeof(data));
    }
  }
  void OnShutdown(SimpleEpollServer* eps, int fd) override {}
  void OnRegistration(SimpleEpollServer*, int, int) override {}
  void OnModification(int, int) override {}     // COV_NF_LINE
  void OnUnregistration(int, bool) override {}  // COV_NF_LINE
  std::string Name() const override { return "ReadPipeCallback"; }
};

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

SimpleEpollServer::SimpleEpollServer()
    : epoll_fd_(epoll_create(1024)),
      timeout_in_us_(0),
      recorded_now_in_us_(0),
      ready_list_size_(0),
      wake_cb_(new ReadPipeCallback),
      read_fd_(-1),
      write_fd_(-1),
      in_wait_for_events_and_execute_callbacks_(false),
      in_shutdown_(false),
      last_delay_in_usec_(0) {
  // ensure that the epoll_fd_ is valid.
  CHECK_NE(epoll_fd_, -1);
  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  int pipe_fds[2];
  if (pipe(pipe_fds) < 0) {
    // Unfortunately, it is impossible to test any such initialization in
    // a constructor (as virtual methods do not yet work).
    // This -could- be solved by moving initialization to an outside
    // call...
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    EPOLL_LOG(FATAL) << "Error " << saved_errno << " in pipe(): "
                     << strerror_r(saved_errno, buf, sizeof(buf));
  }
  read_fd_ = pipe_fds[0];
  write_fd_ = pipe_fds[1];
  RegisterFD(read_fd_, wake_cb_.get(), EPOLLIN);
}

void SimpleEpollServer::CleanupFDToCBMap() {
  auto cb_iter = cb_map_.begin();
  while (cb_iter != cb_map_.end()) {
    int fd = cb_iter->fd;
    CB* cb = cb_iter->cb;

    cb_iter->in_use = true;
    if (cb) {
      cb->OnShutdown(this, fd);
    }

    cb_map_.erase(cb_iter);
    cb_iter = cb_map_.begin();
  }
}

void SimpleEpollServer::CleanupTimeToAlarmCBMap() {
  TimeToAlarmCBMap::iterator erase_it;

  // Call OnShutdown() on alarms. Note that the structure of this loop
  // is similar to the structure of the loop in the function HandleAlarms()
  for (auto i = alarm_map_.begin(); i != alarm_map_.end();) {
    // Note that OnShutdown() can call UnregisterAlarm() on
    // other iterators. OnShutdown() should not call UnregisterAlarm()
    // on itself because by definition the iterator is not valid any more.
    i->second->OnShutdown(this);
    erase_it = i;
    ++i;
    alarm_map_.erase(erase_it);
  }
}

SimpleEpollServer::~SimpleEpollServer() {
  DCHECK_EQ(in_shutdown_, false);
  in_shutdown_ = true;
#ifdef EPOLL_SERVER_EVENT_TRACING
  EPOLL_LOG(INFO) << "\n" << event_recorder_;
#endif
  EPOLL_VLOG(2) << "Shutting down epoll server ";
  CleanupFDToCBMap();

  LIST_INIT(&ready_list_);
  LIST_INIT(&tmp_list_);

  CleanupTimeToAlarmCBMap();

  close(read_fd_);
  close(write_fd_);
  close(epoll_fd_);
}

// Whether a CBAndEventMask is on the ready list is determined by a non-NULL
// le_prev pointer (le_next being NULL indicates the end of the list).
inline void SimpleEpollServer::AddToReadyList(CBAndEventMask* cb_and_mask) {
  if (cb_and_mask->entry.le_prev == NULL) {
    LIST_INSERT_HEAD(&ready_list_, cb_and_mask, entry);
    ++ready_list_size_;
  }
}

inline void SimpleEpollServer::RemoveFromReadyList(
    const CBAndEventMask& cb_and_mask) {
  if (cb_and_mask.entry.le_prev != NULL) {
    LIST_REMOVE(&cb_and_mask, entry);
    // Clean up all the ready list state. Don't bother with the other fields
    // as they are initialized when the CBAndEventMask is added to the ready
    // list. This saves a few cycles in the inner loop.
    cb_and_mask.entry.le_prev = NULL;
    --ready_list_size_;
    if (ready_list_size_ == 0) {
      DCHECK(ready_list_.lh_first == NULL);
      DCHECK(tmp_list_.lh_first == NULL);
    }
  }
}

void SimpleEpollServer::RegisterFD(int fd, CB* cb, int event_mask) {
  CHECK(cb);
  EPOLL_VLOG(3) << "RegisterFD fd=" << fd << " event_mask=" << event_mask;
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i) {
    // do we just abort, or do we just unregister the other callback?
    // for now, let's just unregister the other callback.

    // unregister any callback that may already be registered for this FD.
    CB* other_cb = fd_i->cb;
    if (other_cb) {
      // Must remove from the ready list before erasing.
      RemoveFromReadyList(*fd_i);
      other_cb->OnUnregistration(fd, true);
      ModFD(fd, event_mask);
    } else {
      // already unregistered, so just recycle the node.
      AddFD(fd, event_mask);
    }
    fd_i->cb = cb;
    fd_i->event_mask = event_mask;
    fd_i->events_to_fake = 0;
  } else {
    AddFD(fd, event_mask);
    cb_map_.insert(CBAndEventMask(cb, event_mask, fd));
  }

  // set the FD to be non-blocking.
  SetNonblocking(fd);

  cb->OnRegistration(this, fd, event_mask);
}
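
// Minimal usage sketch (comment only; MyCallback, listen_fd, and done are
// hypothetical): callbacks are registered with an event mask, and the owner
// then drives the loop by calling WaitForEventsAndExecuteCallbacks().
//
//   SimpleEpollServer eps;
//   MyCallback cb;  // implements EpollCallbackInterface
//   eps.RegisterFDForRead(listen_fd, &cb);
//   while (!done) {
//     eps.WaitForEventsAndExecuteCallbacks();
//   }
//   eps.UnregisterFD(listen_fd);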

void SimpleEpollServer::SetNonblocking(int fd) {
  int flags = fcntl(fd, F_GETFL, 0);
  if (flags == -1) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    EPOLL_LOG(FATAL) << "Error " << saved_errno << " doing fcntl(" << fd
                     << ", F_GETFL, 0): "
                     << strerror_r(saved_errno, buf, sizeof(buf));
  }
  if (!(flags & O_NONBLOCK)) {
    int saved_flags = flags;
    flags = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
    if (flags == -1) {
      // bad.
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      EPOLL_LOG(FATAL) << "Error " << saved_errno << " doing fcntl(" << fd
                       << ", F_SETFL, " << saved_flags
                       << "): " << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }
}

int SimpleEpollServer::epoll_wait_impl(int epfd, struct epoll_event* events,
                                       int max_events, int timeout_in_ms) {
  return epoll_wait(epfd, events, max_events, timeout_in_ms);
}

void SimpleEpollServer::RegisterFDForWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLOUT);
}

void SimpleEpollServer::RegisterFDForReadWrite(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN | EPOLLOUT);
}

void SimpleEpollServer::RegisterFDForRead(int fd, CB* cb) {
  RegisterFD(fd, cb, EPOLLIN);
}

void SimpleEpollServer::UnregisterFD(int fd) {
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() == fd_i || fd_i->cb == NULL) {
    // Doesn't exist in server, or has gone through UnregisterFD once and still
    // inside the callchain of OnEvent.
    return;
  }
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordUnregistration(fd);
#endif
  CB* cb = fd_i->cb;
  // Since the links are embedded within the struct, we must remove it from the
  // list before erasing it from the hash_set.
  RemoveFromReadyList(*fd_i);
  DelFD(fd);
  cb->OnUnregistration(fd, false);
  // fd_i->cb is NULL if that fd is unregistered inside the callchain of
  // OnEvent. Since the SimpleEpollServer needs a valid CBAndEventMask after
  // OnEvent returns in order to add it to the ready list, we cannot have
  // UnregisterFD erase the entry if it is in use. Thus, a NULL fd_i->cb is used
  // as a condition that tells the SimpleEpollServer that this entry is unused
  // at a later point.
  if (!fd_i->in_use) {
    cb_map_.erase(fd_i);
  } else {
    // Remove all trace of the registration, and just keep the node alive long
    // enough so the code that calls OnEvent doesn't have to worry about
    // figuring out whether the CBAndEventMask is valid or not.
    fd_i->cb = NULL;
    fd_i->event_mask = 0;
    fd_i->events_to_fake = 0;
  }
}
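
// Sketch of why the in_use / NULL-cb handling above matters
// (SelfClosingCallback is hypothetical): a callback may safely unregister its
// own fd from inside OnEvent(); the entry is only erased from cb_map_ after
// OnEvent() returns (see CallReadyListCallbacks() below).
//
//   class SelfClosingCallback : public EpollCallbackInterface {
//    public:
//     void OnEvent(int fd, EpollEvent* event) override {
//       if (event->in_events & (EPOLLERR | EPOLLHUP)) {
//         eps_->UnregisterFD(fd);  // Safe even though we are mid-callback.
//         close(fd);
//       }
//     }
//     // eps_ would be captured in OnRegistration(); other methods omitted.
//    private:
//     SimpleEpollServer* eps_ = nullptr;
//   };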

void SimpleEpollServer::ModifyCallback(int fd, int event_mask) {
  ModifyFD(fd, ~0, event_mask);
}

void SimpleEpollServer::StopRead(int fd) { ModifyFD(fd, EPOLLIN, 0); }

void SimpleEpollServer::StartRead(int fd) { ModifyFD(fd, 0, EPOLLIN); }

void SimpleEpollServer::StopWrite(int fd) { ModifyFD(fd, EPOLLOUT, 0); }

void SimpleEpollServer::StartWrite(int fd) { ModifyFD(fd, 0, EPOLLOUT); }

void SimpleEpollServer::HandleEvent(int fd, int event_mask) {
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordEpollEvent(fd, event_mask);
#endif
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (fd_i == cb_map_.end() || fd_i->cb == NULL) {
    // Ignore the event.
    // This could occur if epoll() returns a set of events, and
    // while processing event A (earlier) we removed the callback
    // for event B (and are now processing event B).
    return;
  }
  fd_i->events_asserted = event_mask;
  CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
  AddToReadyList(cb_and_mask);
}

void SimpleEpollServer::WaitForEventsAndExecuteCallbacks() {
  if (in_wait_for_events_and_execute_callbacks_) {
    EPOLL_LOG(DFATAL) << "Attempting to call WaitForEventsAndExecuteCallbacks"
                         " when an ancestor to the current function is already"
                         " WaitForEventsAndExecuteCallbacks!";
    // The line below is actually tested, but in coverage mode,
    // we never see it.
    return;  // COV_NF_LINE
  }
  AutoReset<bool> recursion_guard(&in_wait_for_events_and_execute_callbacks_,
                                  true);
  if (alarm_map_.empty()) {
    // no alarms, this is business as usual.
    WaitForEventsAndCallHandleEvents(timeout_in_us_, events_, events_size_);
    recorded_now_in_us_ = 0;
    return;
  }

  // store the 'now'. If we recomputed 'now' every iteration
  // down below, then we might never exit that loop-- any
  // long-running alarms might install other long-running
  // alarms, etc. By storing it here now, we ensure that
  // a more reasonable amount of work is done here.
  int64_t now_in_us = NowInUsec();

  // Get the first timeout from the alarm_map where it is
  // stored in absolute time.
  int64_t next_alarm_time_in_us = alarm_map_.begin()->first;
  EPOLL_VLOG(4) << "next_alarm_time = " << next_alarm_time_in_us
                << " now = " << now_in_us
                << " timeout_in_us = " << timeout_in_us_;

  int64_t wait_time_in_us;
  int64_t alarm_timeout_in_us = next_alarm_time_in_us - now_in_us;

  // If the next alarm is sooner than the default timeout, or if there is no
  // timeout (timeout_in_us_ == -1), wake up when the alarm should fire.
  // Otherwise use the default timeout.
  if (alarm_timeout_in_us < timeout_in_us_ || timeout_in_us_ < 0) {
    wait_time_in_us = std::max(alarm_timeout_in_us, static_cast<int64_t>(0));
  } else {
    wait_time_in_us = timeout_in_us_;
  }

  EPOLL_VLOG(4) << "wait_time_in_us = " << wait_time_in_us;

  // wait for events.

  WaitForEventsAndCallHandleEvents(wait_time_in_us, events_, events_size_);
  CallAndReregisterAlarmEvents();
  recorded_now_in_us_ = 0;
}

void SimpleEpollServer::SetFDReady(int fd, int events_to_fake) {
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i && fd_i->cb != NULL) {
    // This const_cast is necessary for LIST_HEAD_INSERT to work. Declaring
    // entry mutable is insufficient because LIST_HEAD_INSERT assigns the
    // forward pointer of the list head to the current cb_and_mask, and the
    // compiler complains that it can't assign a const T* to a T*.
    CBAndEventMask* cb_and_mask = const_cast<CBAndEventMask*>(&*fd_i);
    // Note that there is no clearly correct behavior here when
    // cb_and_mask->events_to_fake != 0 and this function is called.
    // Of the two operations:
    //      cb_and_mask->events_to_fake = events_to_fake
    //      cb_and_mask->events_to_fake |= events_to_fake
    // the first was picked because it discourages users from calling
    // SetFDReady repeatedly to build up the correct event set as it is more
    // efficient to call SetFDReady once with the correct, final mask.
    cb_and_mask->events_to_fake = events_to_fake;
    AddToReadyList(cb_and_mask);
  }
}
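
// Sketch of the intended calling pattern (eps and fd are hypothetical): pass
// the final mask in a single call rather than building it up incrementally,
// since each call overwrites events_to_fake.
//
//   eps->SetFDReady(fd, EPOLLIN | EPOLLOUT);  // preferred
//   // rather than:
//   //   eps->SetFDReady(fd, EPOLLIN);
//   //   eps->SetFDReady(fd, EPOLLOUT);  // overwrites the faked EPOLLIN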

void SimpleEpollServer::SetFDNotReady(int fd) {
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() != fd_i) {
    RemoveFromReadyList(*fd_i);
  }
}

bool SimpleEpollServer::IsFDReady(int fd) const {
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  return (cb_map_.end() != fd_i && fd_i->cb != NULL &&
          fd_i->entry.le_prev != NULL);
}

void SimpleEpollServer::VerifyReadyList() const {
  int count = 0;
  CBAndEventMask* cur = ready_list_.lh_first;
  for (; cur; cur = cur->entry.le_next) {
    ++count;
  }
  for (cur = tmp_list_.lh_first; cur; cur = cur->entry.le_next) {
    ++count;
  }
  CHECK_EQ(ready_list_size_, count) << "Ready list size does not match count";
}

void SimpleEpollServer::RegisterAlarm(int64_t timeout_time_in_us, AlarmCB* ac) {
  EPOLL_VLOG(4) << "RegisteringAlarm " << ac << " at : " << timeout_time_in_us;
  CHECK(ac);
  if (all_alarms_.find(ac) != all_alarms_.end()) {
    EPOLL_BUG << "Alarm already exists";
  }

  auto alarm_iter = alarm_map_.insert(std::make_pair(timeout_time_in_us, ac));

  all_alarms_.insert(ac);
  // Pass the iterator to the EpollAlarmCallbackInterface.
  ac->OnRegistration(alarm_iter, this);
}

// Unregister a specific alarm callback: iterator_token must be a
// valid iterator. The caller must ensure the validity of the iterator.
void SimpleEpollServer::UnregisterAlarm(const AlarmRegToken& iterator_token) {
  AlarmCB* cb = iterator_token->second;
  EPOLL_VLOG(4) << "UnregisteringAlarm " << cb;
  alarm_map_.erase(iterator_token);
  all_alarms_.erase(cb);
  cb->OnUnregistration();
}

SimpleEpollServer::AlarmRegToken SimpleEpollServer::ReregisterAlarm(
    SimpleEpollServer::AlarmRegToken iterator_token,
    int64_t timeout_time_in_us) {
  AlarmCB* cb = iterator_token->second;
  alarm_map_.erase(iterator_token);
  return alarm_map_.emplace(timeout_time_in_us, cb);
}

int SimpleEpollServer::NumFDsRegistered() const {
  DCHECK_GE(cb_map_.size(), 1u);
  // Omit the internal FD (read_fd_)
  return cb_map_.size() - 1;
}

void SimpleEpollServer::Wake() {
  char data = 'd';  // 'd' is for data. It's good enough for me.
  int rv = write(write_fd_, &data, 1);
  DCHECK_EQ(rv, 1);
}
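
// Wake() forces an in-progress epoll_wait() to return early: writing one byte
// to write_fd_ makes read_fd_ readable, and ReadPipeCallback (above) drains
// it. A minimal sketch, assuming the caller otherwise synchronizes access to
// the server (eps and done are hypothetical):
//
//   // Loop owner:
//   while (!done) {
//     eps->WaitForEventsAndExecuteCallbacks();
//   }
//   // Elsewhere, after queueing work for the loop:
//   eps->Wake();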

int64_t SimpleEpollServer::NowInUsec() const { return WallTimeNowInUsec(); }

int64_t SimpleEpollServer::ApproximateNowInUsec() const {
  if (recorded_now_in_us_ != 0) {
    return recorded_now_in_us_;
  }
  return this->NowInUsec();
}

std::string SimpleEpollServer::EventMaskToString(int event_mask) {
  std::string s;
  if (event_mask & EPOLLIN) s += "EPOLLIN ";
  if (event_mask & EPOLLPRI) s += "EPOLLPRI ";
  if (event_mask & EPOLLOUT) s += "EPOLLOUT ";
  if (event_mask & EPOLLRDNORM) s += "EPOLLRDNORM ";
  if (event_mask & EPOLLRDBAND) s += "EPOLLRDBAND ";
  if (event_mask & EPOLLWRNORM) s += "EPOLLWRNORM ";
  if (event_mask & EPOLLWRBAND) s += "EPOLLWRBAND ";
  if (event_mask & EPOLLMSG) s += "EPOLLMSG ";
  if (event_mask & EPOLLERR) s += "EPOLLERR ";
  if (event_mask & EPOLLHUP) s += "EPOLLHUP ";
  if (event_mask & EPOLLONESHOT) s += "EPOLLONESHOT ";
  if (event_mask & EPOLLET) s += "EPOLLET ";
  return s;
}

void SimpleEpollServer::LogStateOnCrash() {
  EPOLL_LOG(ERROR)
      << "-------------------Epoll Server-------------------------";
  EPOLL_LOG(ERROR) << "Epoll server " << this << " polling on fd " << epoll_fd_;
  EPOLL_LOG(ERROR) << "timeout_in_us_: " << timeout_in_us_;

  // Log sessions with alarms.
  EPOLL_LOG(ERROR) << alarm_map_.size() << " alarms registered.";
  for (auto it = alarm_map_.begin(); it != alarm_map_.end(); ++it) {
    const bool skipped =
        alarms_reregistered_and_should_be_skipped_.find(it->second) !=
        alarms_reregistered_and_should_be_skipped_.end();
    EPOLL_LOG(ERROR) << "Alarm " << it->second << " registered at time "
                     << it->first << " and should be skipped = " << skipped;
  }

  EPOLL_LOG(ERROR) << cb_map_.size() << " fd callbacks registered.";
  for (auto it = cb_map_.begin(); it != cb_map_.end(); ++it) {
    EPOLL_LOG(ERROR) << "fd: " << it->fd << " with mask " << it->event_mask
                     << " registered with cb: " << it->cb;
  }
  EPOLL_LOG(ERROR)
      << "-------------------/Epoll Server------------------------";
}

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

void SimpleEpollServer::DelFD(int fd) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, 0, "DelFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    EPOLL_LOG(FATAL) << "Epoll set removal error for fd " << fd << ": "
                     << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void SimpleEpollServer::AddFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "AddFD");
#endif
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    EPOLL_LOG(FATAL) << "Epoll set insertion error for fd " << fd << ": "
                     << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void SimpleEpollServer::ModFD(int fd, int event_mask) const {
  struct epoll_event ee;
  memset(&ee, 0, sizeof(ee));
  ee.events = event_mask | EPOLLERR | EPOLLHUP;
  ee.data.fd = fd;
#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordFDMaskEvent(fd, ee.events, "ModFD");
#endif
  EPOLL_VLOG(3) << "modifying fd= " << fd << " "
                << EventMaskToString(ee.events);
  if (epoll_ctl(epoll_fd_, EPOLL_CTL_MOD, fd, &ee)) {
    int saved_errno = errno;
    char buf[kErrorBufferSize];
    EPOLL_LOG(FATAL) << "Epoll set modification error for fd " << fd << ": "
                     << strerror_r(saved_errno, buf, sizeof(buf));
  }
}

////////////////////////////////////////

void SimpleEpollServer::ModifyFD(int fd, int remove_event, int add_event) {
  auto fd_i = cb_map_.find(CBAndEventMask(NULL, 0, fd));
  if (cb_map_.end() == fd_i) {
    EPOLL_VLOG(2) << "Didn't find the fd " << fd << " in internal structures";
    return;
  }

  if (fd_i->cb != NULL) {
    int& event_mask = fd_i->event_mask;
    EPOLL_VLOG(3) << "fd= " << fd
                  << " event_mask before: " << EventMaskToString(event_mask);
    event_mask &= ~remove_event;
    event_mask |= add_event;

    EPOLL_VLOG(3) << " event_mask after: " << EventMaskToString(event_mask);

    ModFD(fd, event_mask);

    fd_i->cb->OnModification(fd, event_mask);
  }
}

void SimpleEpollServer::WaitForEventsAndCallHandleEvents(
    int64_t timeout_in_us, struct epoll_event events[], int events_size) {
  if (timeout_in_us == 0 || ready_list_.lh_first != NULL) {
    // If ready list is not empty, then don't sleep at all.
    timeout_in_us = 0;
  } else if (timeout_in_us < 0) {
    EPOLL_LOG(INFO) << "Negative epoll timeout: " << timeout_in_us
                    << "us; epoll will wait forever for events.";
    // If timeout_in_us is < 0 we are supposed to Wait forever. This means we
    // should set timeout_in_us to -1000 so we will
    // Wait(-1000/1000) == Wait(-1) == Wait forever.
    timeout_in_us = -1000;
  } else {
    // If timeout is specified, and the ready list is empty.
    if (timeout_in_us < 1000) {
      timeout_in_us = 1000;
    }
  }
  const int timeout_in_ms = timeout_in_us / 1000;
  int64_t expected_wakeup_us = NowInUsec() + timeout_in_us;

  int nfds = epoll_wait_impl(epoll_fd_, events, events_size, timeout_in_ms);
  EPOLL_VLOG(3) << "nfds=" << nfds;

#ifdef EPOLL_SERVER_EVENT_TRACING
  event_recorder_.RecordEpollWaitEvent(timeout_in_ms, nfds);
#endif

  // If you're wondering why the NowInUsec() is recorded here, the answer is
  // simple: If we did it before the epoll_wait_impl, then the max error for
  // the ApproximateNowInUsec() call would be as large as the maximum length of
  // epoll_wait, which can be arbitrarily long. Since this would make
  // ApproximateNowInUsec() worthless, we instead record the time -after- we've
  // done epoll_wait, which guarantees that the maximum error is the amount of
  // time it takes to process all the events generated by epoll_wait.
  recorded_now_in_us_ = NowInUsec();

  if (timeout_in_us > 0) {
    int64_t delta = NowInUsec() - expected_wakeup_us;
    last_delay_in_usec_ = delta > 0 ? delta : 0;
  } else {
    // timeout_in_us < 0 means we waited forever until an event;
    // timeout_in_us == 0 means there was no kernel delay to track.
    last_delay_in_usec_ = 0;
  }

  if (nfds > 0) {
    for (int i = 0; i < nfds; ++i) {
      int event_mask = events[i].events;
      int fd = events[i].data.fd;
      HandleEvent(fd, event_mask);
    }
  } else if (nfds < 0) {
    // Catch interrupted syscall and just ignore it and move on.
    if (errno != EINTR && errno != 0) {
      int saved_errno = errno;
      char buf[kErrorBufferSize];
      EPOLL_LOG(FATAL) << "Error " << saved_errno << " in epoll_wait: "
                       << strerror_r(saved_errno, buf, sizeof(buf));
    }
  }

  // Now run through the ready list.
  if (ready_list_.lh_first) {
    CallReadyListCallbacks();
  }
}

void SimpleEpollServer::CallReadyListCallbacks() {
  // Check pre-conditions.
  DCHECK(tmp_list_.lh_first == NULL);
  // Swap out the ready_list_ into the tmp_list_ before traversing the list to
  // enable SetFDReady() to just push new items into the ready_list_.
  std::swap(ready_list_.lh_first, tmp_list_.lh_first);
  if (tmp_list_.lh_first) {
    tmp_list_.lh_first->entry.le_prev = &tmp_list_.lh_first;
    EpollEvent event(0);
    while (tmp_list_.lh_first != NULL) {
      DCHECK_GT(ready_list_size_, 0);
      CBAndEventMask* cb_and_mask = tmp_list_.lh_first;
      RemoveFromReadyList(*cb_and_mask);

      event.out_ready_mask = 0;
      event.in_events =
          cb_and_mask->events_asserted | cb_and_mask->events_to_fake;
      // TODO(fenix): get rid of the two separate fields in cb_and_mask.
      cb_and_mask->events_asserted = 0;
      cb_and_mask->events_to_fake = 0;
      {
        // OnEvent() may call UnregisterFD(), so we set in_use here. Any
        // UnregisterFD() call will now simply set the cb to NULL instead of
        // invalidating the cb_and_mask object (by deleting the object in the
        // map to which cb_and_mask refers).
        AutoReset<bool> in_use_guard(&(cb_and_mask->in_use), true);
        cb_and_mask->cb->OnEvent(cb_and_mask->fd, &event);
      }

      // Since OnEvent may have called UnregisterFD, we must check here that
      // the callback is still valid. If it isn't, then UnregisterFD *was*
      // called, and we should now get rid of the object.
      if (cb_and_mask->cb == NULL) {
        cb_map_.erase(*cb_and_mask);
      } else if (event.out_ready_mask != 0) {
        cb_and_mask->events_to_fake = event.out_ready_mask;
        AddToReadyList(cb_and_mask);
      }
    }
  }
  DCHECK(tmp_list_.lh_first == NULL);
}

void SimpleEpollServer::CallAndReregisterAlarmEvents() {
  int64_t now_in_us = recorded_now_in_us_;
  DCHECK_NE(0, recorded_now_in_us_);

  TimeToAlarmCBMap::iterator erase_it;

  // execute alarms.
  for (auto i = alarm_map_.begin(); i != alarm_map_.end();) {
    if (i->first > now_in_us) {
      break;
    }
    AlarmCB* cb = i->second;
    // Execute the OnAlarm() only if we did not register
    // it in this loop itself.
    const bool added_in_this_round =
        alarms_reregistered_and_should_be_skipped_.find(cb) !=
        alarms_reregistered_and_should_be_skipped_.end();
    if (added_in_this_round) {
      ++i;
      continue;
    }
    all_alarms_.erase(cb);
    const int64_t new_timeout_time_in_us = cb->OnAlarm();

    erase_it = i;
    ++i;
    alarm_map_.erase(erase_it);

    if (new_timeout_time_in_us > 0) {
      // We add to hash_set only if the new timeout is <= now_in_us.
      // if timeout is > now_in_us then we have no fear that this alarm
      // can be reexecuted in this loop, and hence we do not need to
      // worry about a recursive loop.
      EPOLL_DVLOG(3) << "Reregistering alarm "
                     << " " << cb << " " << new_timeout_time_in_us << " "
                     << now_in_us;
      if (new_timeout_time_in_us <= now_in_us) {
        alarms_reregistered_and_should_be_skipped_.insert(cb);
      }
      RegisterAlarm(new_timeout_time_in_us, cb);
    }
  }
  alarms_reregistered_and_should_be_skipped_.clear();
}
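
// Sketch of a recurring alarm built on the reregistration behavior above
// (PeriodicAlarm, DoPeriodicWork, and period_us_ are hypothetical; EpollAlarm
// is defined below). Returning an absolute time > 0 from OnAlarm() causes
// CallAndReregisterAlarmEvents() to register the alarm again at that time.
//
//   class PeriodicAlarm : public EpollAlarm {
//    public:
//     PeriodicAlarm(SimpleEpollServer* eps, int64_t period_us)
//         : eps_(eps), period_us_(period_us) {}
//     int64_t OnAlarm() override {
//       EpollAlarm::OnAlarm();  // Marks the alarm as no longer registered.
//       DoPeriodicWork();
//       return eps_->ApproximateNowInUsec() + period_us_;
//     }
//    private:
//     SimpleEpollServer* eps_;
//     int64_t period_us_;
//   };
//
//   PeriodicAlarm alarm(&eps, 1000000);  // roughly once per second
//   eps.RegisterAlarm(eps.ApproximateNowInUsec() + 1000000, &alarm);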

EpollAlarm::EpollAlarm() : eps_(NULL), registered_(false) {}

EpollAlarm::~EpollAlarm() { UnregisterIfRegistered(); }

int64_t EpollAlarm::OnAlarm() {
  registered_ = false;
  return 0;
}

void EpollAlarm::OnRegistration(const SimpleEpollServer::AlarmRegToken& token,
                                SimpleEpollServer* eps) {
  DCHECK_EQ(false, registered_);

  token_ = token;
  eps_ = eps;
  registered_ = true;
}

void EpollAlarm::OnUnregistration() { registered_ = false; }

void EpollAlarm::OnShutdown(SimpleEpollServer* eps) {
  registered_ = false;
  eps_ = NULL;
}

// If the alarm was registered, unregister it.
void EpollAlarm::UnregisterIfRegistered() {
  if (!registered_) {
    return;
  }

  eps_->UnregisterAlarm(token_);
}

void EpollAlarm::ReregisterAlarm(int64_t timeout_time_in_us) {
  DCHECK(registered_);
  token_ = eps_->ReregisterAlarm(token_, timeout_time_in_us);
}

}  // namespace epoll_server