The objective is to implement the most basic working FastCGI protocol in pure Racket.
How it works
The minimal fastcgi implementation will look like:
- wait on a tcp connection
- read all input data
- write output hello world like data
- close connection
A request is composed of parts, denoted here as segments. Each segment consists of an 8-byte header, N bytes of data and M bytes of padding, where N and M >= 0.
The header is:
- fastcgi version (don’t care)
- segment type (begin request, end request, stdin, data, stdout, stderr, etc…)
- request id, high byte
- request id, low byte (the id is useful when the connection is shared for multiple requests, not the case here)
- data length, high byte
- data length, low byte
- padding length
- reserved byte (don’t care)
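For example, a stdin segment carrying 12 bytes of data for request id 1, followed by 4 bytes of padding, would start with this 8-byte header (the id, length and padding values are made up for illustration):
;; version, type (5 = stdin), request id high/low, data length high/low, padding, reserved
(bytes 1 5 0 1 0 12 4 0)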
Racket implementation
The part that decodes the incoming bytes:
#lang racket
;; to store the input data
(struct request [segment-id stdin data params] #:mutable #:transparent)
(define (create-empty-request)
(request #f (open-output-bytes) (open-output-bytes) (open-output-bytes)))
(define (decode-segment request in out)
;; don't care if read-bytes returns eof
(define header (read-bytes 8 in))
(define segment-type (bytes-ref header 1))
(define segment-id (+ (arithmetic-shift (bytes-ref header 2) 8)
(bytes-ref header 3)))
;; don't care if read-bytes returns eof
(define data-length (+ (arithmetic-shift (bytes-ref header 4) 8)
(bytes-ref header 5)))
(define data (read-bytes data-length in))
;; consume the padding bytes
(void (read-bytes (bytes-ref header 6) in))
(printf "segment type: ~a\n" segment-type)
(printf "id: ~a\n" segment-id)
(printf "data length: ~a\n" data-length)
(printf "~a\n" data)
;; segment-id = 0 is a special case where the web server and the fastcgi
;; process exchange information like the maximum number of connections,
;; so... we don't care.
(when (= 0 segment-id)
(raise "unimplemented"))
(printf "~a\n" request)
(case segment-type
;; fcgi beginrequest
((1) (set-request-segment-id! request segment-id))
;; fcgi params
((4) (display data (request-params request)))
;; fcgi stdin
((5) (if (= 0 data-length) ; Not sure if it's the right way but use that to
; know if the webserver input is over.
(respond request out)
(display data (request-stdin request))))
;; fcgi data
((8) (display data (request-data request)))
;; fcgi get values
;; fcgi abort request
((2 9) (raise "unimplemented"))
(else (raise "undocumented type"))))
The response:
(define (build-header type id data-length padding-length)
(bytes 1 type
(arithmetic-shift id -8) (modulo id 256)
(arithmetic-shift data-length -8) (modulo data-length 256)
padding-length 0))
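;; For example, with request id 1 the stdout header used just below for 64
;; bytes of data and no padding is (build-header 6 1 64 0), which produces
;; (bytes 1 6 0 1 0 64 0 0).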
(define (respond request out)
;; header of the stdout
(write-bytes (build-header 6 (request-segment-id request) 64 0) out)
;; the stdout data
(write-bytes
#"Content-type: text/html\r\n\r\n<html><body>hello fcgi</body></html> "
out)
;; an empty stdout segment to indicate we have finished the stdout stream
(write-bytes (build-header 6 (request-segment-id request) 0 0) out)
;; the end request type
(write-bytes (build-header 3 (request-segment-id request) 8 0) out)
(write-bytes (bytes 0 0 0 0 0 0 0 0) out)
(close-output-port out))
The server:
(module+ main
(require racket/tcp)
(define server (tcp-listen 1025))
(let loop-main ()
(let-values ([(in out) (tcp-accept server)]
[(request) (create-empty-request)])
(let loop ()
(if (port-closed? out)
;; the end of the request
(close-input-port in)
(begin
(decode-segment request in out)
(loop))))
)
(loop-main)))
Nginx configuration:
location ~ \.rkt$ {
fastcgi_pass 127.0.0.1:1025;
include fastcgi.conf; # this adds several parameters to be passed
# to the fastcgi process, like the HTTP method.
}
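For reference, the stock fastcgi.conf shipped with nginx is mostly a list of fastcgi_param lines; a few representative entries (the exact file may vary between distributions):
fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;
fastcgi_param QUERY_STRING $query_string;
fastcgi_param REQUEST_METHOD $request_method;
fastcgi_param CONTENT_TYPE $content_type;
fastcgi_param CONTENT_LENGTH $content_length;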
Go to http://localhost/anything.rkt and it displays “hello fcgi”.
Display the HTTP headers
First of all we need to extract the parameters from the request-params field: they are name/value pairs, each preceded by the lengths of the name and the value.
(define (decode-params in)
(define (decode-param-length in)
(define b3 (read-byte in))
(cond [(equal? b3 eof) eof]
[(> b3 #x7f)
(let ([others (read-bytes 3 in)])
  ;; lengths are big-endian: b3 is the high byte, the next three bytes
  ;; follow in decreasing significance
  (+
   (arithmetic-shift (bitwise-and b3 #x7f) 24)
   (arithmetic-shift (bytes-ref others 0) 16)
   (arithmetic-shift (bytes-ref others 1) 8)
   (bytes-ref others 2)))]
[else b3]))
(let loop ([result empty])
(let ([length0 (decode-param-length in)])
(if (equal? eof length0)
result
(let ([length1 (decode-param-length in)])
(loop
(cons
(cons (read-bytes length0 in)
(read-bytes length1 in))
result)))))))
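As a quick sanity check (the name and value here are made up), feeding decode-params a single hand-built name/value pair gives:
;; one-byte lengths 11 and 9, then the name and value bytes
(decode-params
 (open-input-bytes (bytes-append (bytes 11 9) #"SCRIPT_NAME" #"/test.rkt")))
;; => '((#"SCRIPT_NAME" . #"/test.rkt"))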
Apply this decoder and present the parameters; let's also add a couple of convenience functions.
(define (fcgi-end-request id out)
(write-bytes (build-header 3 id 8 0) out)
(write-bytes (bytes 0 0 0 0 0 0 0 0) out)
(close-output-port out))
(define (write-fcgi-stdout id data out [close #f])
(define data-length (bytes-length data))
(define padding-length (if (= 0 (modulo data-length 8)) 0 (- 8 (modulo data-length 8))))
(write-bytes (build-header 6 id data-length padding-length) out)
(write-bytes data out)
(when (not (= 0 padding-length))
(write-bytes (make-bytes padding-length) out))
(when close
(write-bytes (build-header 6 id 0 0) out)))
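;; For instance, with request id 1, (write-fcgi-stdout 1 #"hello" out) writes
;; the header (bytes 1 6 0 1 0 5 3 0), the 5 data bytes, and 3 zero bytes of
;; padding to keep the segment 8-byte aligned.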
(define (respond request out)
;; get the data from request
(define params (decode-params
(open-input-bytes (let ([port (request-params request)])
(close-output-port port)
(get-output-bytes port)))))
(define data (open-output-bytes))
(write-bytes #"Content-type: text/html\r\n\r\n<html><body>" data)
(write-bytes #"<table>" data)
(for ([param params])
(write-bytes #"<tr><td>" data)
(write-bytes (car param) data)
(write-bytes #"</td><td>" data)
(write-bytes (cdr param) data)
(write-bytes #"</td></tr>" data))
(write-bytes #"</table>" data)
(write-bytes #"</body></html>" data)
(close-output-port data)
(write-fcgi-stdout (request-segment-id request)
(get-output-bytes data)
out
#t)
;; the end request type
(fcgi-end-request (request-segment-id request) out))
Performance expectation
To get an idea of the number of requests per second I used ab -c 25 -n 10000
on several configurations:
- the Racket web-server (a web server written in pure Racket): 1300
- nginx + the Chicken fastcgi binding: 10000
- nginx + PHP-FPM (5 worker threads seem to be optimal on my 2-core CPU): 7700
- nginx alone: 18000
- my stuff: 300
Wait, what, 300? Something is obviously wrong; furthermore ab
gets stuck most of the time on the last 1000 requests. After suspecting the GC, I found that I hadn't read the documentation for tcp-listen, and changed max-allow-wait to something like 1024. That gives 2500 requests/second, and even more, 6500 r/s, after removing the printf calls.
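Concretely, the fix is just the optional max-allow-wait (and reuse?) arguments of tcp-listen; this is the listener used in the threaded versions below:
(define server (tcp-listen 1025 1024 #t))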
Adding threads
The server can still handle only one request at a time: a new request has to wait for the previous one to finish. Using Racket's green threads lets us handle requests concurrently.
The change to the code is straightforward: just create a new thread for every incoming request.
(module+ main
(require racket/tcp)
(define server (tcp-listen 1025 1024 #t))
(let loop-main ()
(let-values ([(in out) (tcp-accept server)]
[(request) (create-empty-request)])
(thread
(lambda ()
(let loop ()
;; simulate work
(sleep sleeping-time)
(if (port-closed? out)
;; the end of the request
(close-input-port in)
(begin
(decode-segment request in out)
(loop))))
))
)
(loop-main)))
To show the difference from before, let's simulate some work with the sleeping-time variable and use ab -c 100 -n 10000
to run the experiments.
With the previous non-threaded code:
- sleeping-time=0.0001 -> 1600 requests/s
- sleeping-time=0.001 -> 196 requests/s
With the threaded code:
- sleeping-time=0.01 -> 1900 requests/s
Even with heavier simulated work per request, the threaded code is faster.
A round robin strategy
Instead of creating a new thread and a new request for every incoming connection, create a bunch of threads up front and select among them in a round-robin fashion.
(module+ main
(require racket/tcp)
(define server (tcp-listen 1025 1024 #t))
(define max-thread 100)
(define round-robin
(let ([robin 0])
(lambda ()
(define rob (modulo (add1 robin) max-thread))
(set! robin rob)
rob)))
(define (create-thread request)
(define (reset-request! request)
(set-request-segment-id! request #f)
(set-request-stdin! request (open-output-bytes))
(set-request-data! request (open-output-bytes))
(set-request-params! request (open-output-bytes)))
(thread
(lambda ()
(let main-loop ()
(define in (thread-receive))
(define out (thread-receive))
(reset-request! request)
(let loop ()
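;; simulate some work, as in the threaded version above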
(sleep 0.0001)
(if (port-closed? out)
;; the end of the request
(close-input-port in)
(begin
(decode-segment request in out)
(loop))))
(main-loop)))))
(define thread-vec (for/vector ([i (range max-thread)])
(create-thread (create-empty-request))))
(let loop-main ()
(let-values ([(in out) (tcp-accept server)]
#;[(request) (create-empty-request)]
[(rob) (round-robin)])
(thread-send (vector-ref thread-vec rob) in)
(thread-send (vector-ref thread-vec rob) out)
)
(loop-main)))
I have no idea which approach is better. If I increase the number of concurrent connections in Apache Bench, the round-robin version seems to do better, but that may just be because round robin puts a fixed upper bound on the number of threads.
Gambit implementation
I quickly did a Gambit implementation and got 6800 requests/second.