For more info, see http://www.xs4all.nl/~johnpc/inn



--- article.c	1997/02/08 14:19:13	1.1
+++ article.c	1997/03/19 12:12:10
@@ -1,4 +1,4 @@
-/*  $Revision: 1.1 $
+/*  $Revision: 1.39 $
 **
 **  Article-related routines.
 */
@@ -8,6 +8,19 @@
 #include "clibrary.h"
 #include "nnrpd.h"
 
+/*
+ * OVERSCREAM - to make the overview database screaming fast, and because
+ * I scream in terror about the previous implementation.
+ * See http://www.xs4all.nl/~johnpc/inn/ for more information on this patch.
+ */
+
+#define OVERSCREAM
+
+#ifdef OVERSCREAM
+# include <sys/types.h>
+# include <sys/mman.h>
+#endif /* OVERSCREAM */
+
 
 /*
 **  Data structures for use in ARTICLE/HEAD/BODY/STAT common code.
@@ -61,11 +74,33 @@
 /*
 **  Overview state information.
 */
+#ifdef OVERSCREAM
+
+STATIC caddr_t		OVERshm = (caddr_t) NULL; /* base of mmap, NULL if none */
+STATIC size_t		OVERsize;		/* usable bytes, ends at '\n' */
+STATIC size_t		OVERmsize;		/* length passed to mmap/munmap */
+STATIC int		OVERfd;			/* fd of mapped file	*/
+STATIC ARTNUM		OVERfirst, OVERlast;	/* first/last entries	*/
+STATIC int		OVERopens;		/* Number of opens done	*/
+STATIC char*		OVERcache;		/* next line to try, or NULL */
+STATIC ARTNUM		OVERprev;		/* artnum that set OVERcache */
+#define LINSEARCH	5			/* linear search range  */
+#define MIDSKEW		0.1			/* 10% bias toward middle */
+
+STATIC int		mmapsuck;		/* nonzero once we syslogged */
+#define YOUR_MMAP_SUCKS do { if ( ! mmapsuck++ ) \
+    syslog(L_NOTICE, "Your mmap() implementation sucks."); } while (0)
+
+#else /* !OVERSCREAM */
+
 STATIC QIOSTATE		*OVERqp;		/* Open overview file	*/
 STATIC char		*OVERline;		/* Current line		*/
 STATIC ARTNUM		OVERarticle;		/* Current article	*/
 STATIC int		OVERopens;		/* Number of opens done	*/
 
+#endif
+
+
 
 /*
 **  Read the overview schema.
@@ -676,6 +711,10 @@
     register int	i;
     ARTOVERFIELD	*fp;
     char		*next;
+#ifdef OVERSCREAM
+    char*		eol = strchr(p, '\n');
+#endif
+
 
     fp = &ARTfields[field - 1];
 
@@ -683,8 +722,13 @@
       field = ARTfirstfullfield;
  
     /* Skip leading headers. */
-    for (; --field >= 0 && *p; p++)
+    for (; --field >= 0 && *p && *p != '\n'; p++)
+#ifdef OVERSCREAM
+	if ((p = memchr(p, '\t', OVERsize - (p - OVERshm))) == NULL ||
+	    p > eol )
+#else
 	if ((p = strchr(p, '\t')) == NULL)
+#endif
 	    return NULL;
     if (*p == '\0')
 	return NULL;
@@ -702,10 +746,22 @@
     }
   
     /* Figure out length; get space. */
+
+#ifdef OVERSCREAM
+    if ((next = memchr(p, '\t', OVERsize - (p - OVERshm))) != NULL &&
+	  p < eol )
+	i = next - p;
+    else 
+	i = eol - p;
+
+#else /* !OVERSCREAM */
+
     if ((next = strchr(p, '\t')) != NULL)
 	i = next - p;
     else
 	i = strlen(p);
+#endif
+
     if (buffsize == 0) {
 	buffsize = i;
 	buff = NEW(char, buffsize + 1);
@@ -720,6 +776,430 @@
     return buff;
 }
 
+#ifdef OVERSCREAM
+
+/*
+ * helper: scan backwards from p (inclusive) over l bytes for c, or NULL
+ */
+
+STATIC char*
+memrchr(p, c, l)
+    register char* p;
+    register char c;
+    register int l;
+{
+    for (; l--; --p)
+	if ( *p == c )
+	    return(p);
+    return(NULL);
+}
+
+/*
+ * mmap the OVERVIEW file of group GRPlast; TRUE if OVERshm is usable.
+ */
+
+STATIC BOOL
+OVERopen()
+{
+    char		name[SPOOLNAMEBUFF];
+    struct stat 	sb;
+    char*		p;
+    static int		pagesize = 0;
+
+    /* return true if already mapped */
+    if ( OVERshm ) {
+	return TRUE;
+    }
+    /* return false if an open was already tried (OVERclose resets this) */
+    if ( OVERopens++ ) {
+	return FALSE;
+    }
+    /* get memory pagesize if we don't have it already */
+    if ( ! pagesize && !
+#ifdef _SC_PAGE_SIZE
+	(pagesize = sysconf(_SC_PAGE_SIZE))
+#else
+# ifdef _SC_PAGESIZE
+	(pagesize = sysconf(_SC_PAGESIZE))
+# else
+	(pagesize = getpagesize())
+# endif
+#endif
+	) {
+	syslog(L_NOTICE, "%s: Can't getpagesize", ClientHost);
+	return FALSE;
+    }
+    /* mmap the file; length is rounded up (pagesize assumed a power of 2) */
+    (void)sprintf(name, "%s/%s/%s", _PATH_OVERVIEWDIR, GRPlast, _PATH_OVERVIEW);
+    if ( (OVERfd = open(name, O_RDONLY)) < 0 ) {
+	/* no overview file */
+	syslog(L_NOTICE, "%s can't open %s: %m", ClientHost, name);
+	return FALSE;
+    }
+    if ( fstat(OVERfd, &sb) == -1 ) {
+	syslog(L_NOTICE, "%s can't stat %s: %m", ClientHost, name);
+	(void)close(OVERfd);
+	return FALSE;
+    }
+    if ( (OVERsize = sb.st_size) <= 1 ) {
+	syslog(L_NOTICE, "%s: %s is too small", ClientHost, name);
+	(void)close(OVERfd);
+	return FALSE;
+    }
+    OVERmsize = (OVERsize + pagesize - 1) & ~(pagesize - 1);
+    if ( (OVERshm = mmap(NULL, OVERmsize, PROT_READ, MAP_SHARED, OVERfd, 0))
+	 == (caddr_t) -1 )
+    {
+	syslog(L_NOTICE, "%s can't mmap %s: %m", ClientHost, name);
+	(void)close(OVERfd);
+	OVERshm = NULL;
+	return FALSE;
+    }
+    /* get first entry; an atol() result of 0 is treated as a bad file */
+    if ( (OVERfirst = atol((char*) OVERshm)) == 0 ) {
+	syslog(L_NOTICE, "%s: %s: bad format", ClientHost, name);
+	(void)munmap(OVERshm, OVERmsize);
+	(void)close(OVERfd);
+	OVERshm = NULL;
+	return FALSE;
+    }
+
+    /* get last entry, trimming OVERsize back to the last usable line */
+    if ( *(OVERshm + OVERsize - 1) != '\n' ) {
+	/*
+	 * If you get here, then your mmap() implementation sucks.
+	 * Go complain with your OS vendor, that their mmap() can't
+	 * do mmap()ing of growing files properly.
+	 * We try to find a decent record near the end, for the poor
+	 * sobs without proper mmap. There are a lot of other places
+	 * in the code with hacks for bad mmap(). Mainly because I'm
+	 * one of the poor sobs :(
+	 */
+	YOUR_MMAP_SUCKS;
+    }
+    do {
+	/*
+         * Try to find any newline. If there isn't any, the entire file
+	 * is crap. Normally this finds the newline right at the end.
+	 */
+	p = memrchr(OVERshm + OVERsize - 1, '\n', OVERsize - 1);
+	if ( p == NULL ) {
+	    /* overview file only contains garbage. */
+	    (void)munmap(OVERshm, OVERmsize);
+	    (void)close(OVERfd);
+	    OVERshm = NULL;
+	    return FALSE;
+	}
+	OVERsize = p - OVERshm + 1;
+	if ( (p = memrchr((char*) OVERshm + OVERsize - 2, '\n',
+			    OVERsize - 2)) == NULL )
+	{
+	    /* Apparently only 1 (usable) line */
+	    OVERlast = OVERfirst;
+	    OVERcache = NULL;
+	    return TRUE;
+	}
+	OVERlast = atol(p+1);
+    }
+    while ( OVERlast == 0 && --OVERsize );
+
+    if ( !OVERsize ) {
+	(void)munmap(OVERshm, OVERmsize);
+	(void)close(OVERfd);
+	OVERshm = NULL;
+	return FALSE;
+    }
+
+    OVERcache = NULL;
+    return TRUE;
+}
+
+/*
+ * Unmap and close the overview file, if any, and let OVERopen try again.
+ */
+
+void
+OVERclose()
+{
+    if ( OVERshm ) {
+	if ( munmap(OVERshm, OVERmsize) == -1 ) {
+	    syslog(L_NOTICE, "%s can't munmap: %m", ClientHost);
+	}
+	(void)close(OVERfd);
+	OVERshm = NULL;
+    }
+    OVERopens = 0;
+}
+
+/*
+ * Find artnum's line in the mmap'ed overview file with an interpolating
+ * binary search, using OVERcache/OVERprev to speed up sequential access.
+ * Returns a pointer into the mapping (so it's !!NOT!! null terminated,
+ * and can't be written to!!), or NULL on failure.
+ */
+
+STATIC char*
+OVERfind(artnum)
+    ARTNUM	artnum;
+{
+    char*	bottom;
+    char*	top;
+    ARTNUM	bottomnr;
+    ARTNUM	topnr;
+    char*	pos;
+    ARTNUM	nr;
+    int		i;
+
+    /* default startpos */
+    bottom = OVERshm;
+    bottomnr = OVERfirst;
+    top = OVERshm + OVERsize - 1;
+    topnr = OVERlast;
+
+    if ( OVERcache ) {
+	/*
+	 * for speedy sequential access. OVERcache, if non-NULL, points to
+	 * the "next" entry. OVERprev is the previous article number found.
+	 * Also check for sucking mmap() implementations.
+	 */
+	if ( *OVERcache == '\0' ) {
+	    YOUR_MMAP_SUCKS;
+	    OVERcache = memchr(OVERcache, '\n',
+				OVERsize - (OVERcache - OVERshm));
+	    if ( OVERcache == NULL || OVERcache == OVERshm + OVERsize - 1 ) {
+		OVERcache = NULL;
+		return NULL;
+	    }
+	    OVERcache++;
+	}
+	nr = atol(OVERcache);
+	if ( nr < OVERfirst || nr > OVERlast ) {
+	    /* boo */
+	    OVERcache = NULL;
+	    return NULL;
+	}
+	if ( nr == artnum ) {
+	    pos = OVERcache;
+	    goto bingo; /* calculate next OVERcache + return. (EW! a goto! :) */
+	}
+	else if ( artnum > nr ) {
+	    /* treat cache as first binary search */
+	    bottom = OVERcache;
+	    bottomnr = nr;
+	}
+	else {
+	    /* cache is first top */
+	    top = OVERcache - 1;
+	    topnr = nr - 1;
+	    if ( artnum > OVERprev ) {
+		/*
+		 * optimization: we're searching for something that isn't
+		 * in the database, but we want to keep the cache clean.
+		 * this occurs when we think an article is there, but it
+		 * really isn't, eg. because NOSCANDIR is on, or simply
+		 * because the overview database leaks.
+		 */
+		return(NULL);
+	    }
+	}
+    }
+
+    /* sanity check */
+    if ( artnum < bottomnr || artnum > topnr ) {
+	OVERcache = NULL;
+	return NULL;
+    }
+
+    for (;;) {
+	/*
+	 * This is the binary search loop, there are about a zillion
+	 * exits so I found it neater to code it in an endless loop :)
+	 * It simply continues until it is either found or it isn't...
+	 *
+	 * Note that we don't do a real binary search, but we guess
+	 * a position using the fact that the overview database usually
+	 * contains a reasonably linear range of articles, without any
+	 * big leaps, but we skew it a bit towards the middle to prevent
+	 * slow convergence in boundary cases (see also below).
+	 *
+	 * We switch to linear searching when we're "close",
+	 * because on short ranges, linear searches are about as fast
+	 * (or faster) anyway. LINSEARCH is currently guessed at 5,
+	 * because on average it takes 2.5 searches using a linear search,
+	 * where it usually takes 3 "straight" binary searches.
+	 *
+	 * Unfortunately, we can't be sure we get into linear search when
+	 * we're close, because the database may have large holes.
+	 */
+	/* test if it's near the bottom */
+	if ( artnum < bottomnr + LINSEARCH ) {
+	    i = 0;
+	    while ( artnum > bottomnr && i++ < LINSEARCH ) {
+		/* search next line */
+		bottom = memchr(bottom, '\n', OVERsize - (bottom - OVERshm));
+		if ( bottom == NULL || bottom == top + 1 ) {
+		    /* reached end of file */
+		    OVERcache = NULL;
+		    return NULL;
+		}
+		if ( *++bottom == 0 ) {
+		    YOUR_MMAP_SUCKS;
+		    continue;
+		}
+		bottomnr = atol(bottom);
+		if ( bottomnr < OVERfirst || bottomnr > OVERlast ) {
+		    OVERcache = NULL;
+		    return NULL;
+		}
+	    }
+	    if ( artnum == bottomnr ) {
+		pos = bottom;
+		goto bingo; /* calculate next OVERcache + return. */
+	    }
+	    else {
+		/* didn't find it, but we came close. still cache position */
+		OVERcache = bottom;
+		OVERprev = artnum;
+		return NULL;
+	    }
+	    /*NOTREACHED*/
+	}
+	/* test if it's near the top */
+	if ( artnum > topnr - LINSEARCH ) {
+	    /*
+	     * topnr is frequently guessed, so we must first determine it
+	     * correctly. The fun part about searching backwards is that
+	     * the next position (OVERcache) follows easily...
+	     */
+	    i = 0;
+	    do {
+		OVERcache = (top == OVERshm + OVERsize - 1) ? NULL : top + 1;
+		if ( (top = memrchr(top - 1, '\n', top - OVERshm))
+		    == NULL || top + 1 == bottom )
+		{
+		    /* search hit bottom */
+		    OVERcache = NULL;
+		    return NULL;
+		}
+		if ( *(top + 1) == 0 ) {
+		    YOUR_MMAP_SUCKS;
+		    /* make sure we continue */
+		    topnr = artnum + 1;
+		    continue;
+		}
+		topnr = atol(top + 1);
+		if ( topnr < OVERfirst || topnr > OVERlast ) {
+		    OVERcache = NULL;
+		    return NULL;
+		}
+	    }
+	    while ( artnum < topnr && i++ < LINSEARCH );
+	    if ( artnum == topnr ) {
+		/* bingo. This time we know OVERcache already */
+		OVERprev = artnum;
+		return(top + 1);
+	    }
+	    else {
+		/* not found, but close. cache position */
+		OVERprev = artnum;
+		return NULL;
+	    }
+	    /*NOTREACHED*/
+	}
+
+	/*
+	 * now for the real binary search:
+	 * Estimate the position of artnum, but with a small offset towards
+	 * the middle, for better convergence in case the set of articles
+	 * is non-linear (you get a straight binary search if MIDSKEW is 1.0).
+	 * MIDSKEW is currently determined using a big thumb, occultism,
+	 * astrology, cat /dev/uri-geller and some common sense (but not much)
+	 * MIDSKEW == 0.0 makes the search take only 1 iteration in case
+	 * the overview database is a monotonous array of lines with equal
+	 * length, but can make for really lousy searches in anything not like
+	 * the above, which, in the real world, is practically always.
+	 * MIDSKEW == 1.0 gives you a true binary search without any guessing
+	 * whatsoever.
+	 * I thought 10% would be good enough. Only rigorous testing can
+	 * determine the optimal value, and then it still depends on a lot
+	 * of settings, like expire times, user newsgroups preference,
+	 * presence of cancelbots or cancelwars, frequency of expireover
+	 * runs... need I say more? :)
+	 */
+	if ( topnr <= bottomnr ) {
+	    /* Safety net. This REALLY should never happen. */
+	    syslog(L_NOTICE,
+		   "%s: ASSERTION FAILED: %d < %d looking for %d in %s",
+		   ClientHost, topnr, bottomnr, artnum, GRPlast);
+	}
+	pos = bottom + (int) ((double) (top - bottom) * (MIDSKEW * 0.5) +
+			      (top - bottom) * (1.0 - MIDSKEW) *
+			      (artnum - bottomnr) / (topnr - bottomnr));
+	/* search forward for newline */
+	if ( (pos = memchr(pos, '\n', OVERsize - (pos - OVERshm))) == NULL ) {
+	    /* this shouldn't happen */
+	    OVERcache = NULL;
+	    return NULL;
+	}
+	if ( pos == top ) {
+	    /* hmm... */
+	    if ( (pos = memrchr(pos - 1, '\n', pos - OVERshm))
+		 == NULL || pos == bottom - 1 )
+	    {
+		/*
+		 * This is what happens when there's a large hole and we're
+		 * looking for something inside the hole (which isn't there).
+		 * still record the position in this case...
+		 */
+		OVERcache = (top == OVERshm + OVERsize - 1) ? NULL : top + 1;
+		OVERprev = artnum;
+		return NULL;
+	    }
+	}
+	/* see where we are */
+	if ( *++pos == 0 ) {
+	    YOUR_MMAP_SUCKS;
+	    pos = memchr(pos, '\n', OVERsize - (pos - OVERshm));
+	    if ( pos == NULL || pos == OVERshm + OVERsize - 1 || pos == top ) {
+		OVERcache = NULL;
+		return NULL;
+	    }
+	    pos++;
+	}
+	nr = atol(pos);
+	if ( nr < OVERfirst || nr > OVERlast ) {
+	    OVERcache = NULL;
+	    return NULL;
+	}
+	if ( nr == artnum ) {
+	    /* bingo. Set cache to next entry */
+bingo:
+	    OVERcache = memchr(pos, '\n', OVERsize - (pos - OVERshm));
+	    if ( OVERcache == OVERshm + OVERsize - 1 )
+		OVERcache = NULL;
+	    else if ( OVERcache )
+		OVERcache++;
+	    OVERprev = artnum;
+	    return (pos);
+	}
+	if ( nr < artnum ) {
+	    /* found a new bottom */
+	    bottom = pos;
+	    bottomnr = nr;
+	}
+	else /* nr > artnum */ {
+	    /*
+	     * found a new top. Setting topnr to nr-1 is not entirely
+	     * correct, but who cares. (In fact we do care, but adjust
+	     * later :)
+	     */
+	    top = pos - 1;
+	    topnr = nr - 1;
+	}
+    }
+    /*NOTREACHED*/
+}
+
+#else /* !OVERSCREAM */
 
 /*
 **  Open an OVERVIEW file.
@@ -756,7 +1236,7 @@
     if (OVERqp != NULL) {
 	QIOclose(OVERqp);
 	OVERqp = NULL;
-	OVERopens = 0;
+	OVERopens = 0; /* this is a bug */
     }
 }
 
@@ -789,6 +1269,8 @@
     return OVERarticle == artnum ? OVERline : NULL;
 }
 
+#endif
+
 
 /*
 **  Read an article and create an overview line without the trailing
@@ -1078,12 +1560,22 @@
 	if (ARTfind(i) < 0)
 	    continue;
 
+	/*OVERVIEWcount++;*/
 	if (Opened && (p = OVERfind(i)) != NULL) {
+#ifdef OVERSCREAM
+	    char* eol = memchr(p, '\n', OVERsize - (p - OVERshm));
+	    if ( eol == NULL )
+		continue; /* this should NEVER NEVER EVER NEVER EVER happen */
+	    fwrite(p, 1, eol - p, stdout);
+	    fwrite("\r\n", 1, 2, stdout);
+#else
 	    Printf("%s\r\n", p);
+#endif
 	    continue;
 	}
 
 	(void)sprintf(buff, "%ld", i);
+	/*OVERGENcount++;*/
 	if ((p = OVERgen(buff)) != NULL)
 	    Printf("%s\r\n", p);
     }
