[PVFS2-developers] metadata record pre-allocation

Murali Vilayannur vilayann at mcs.anl.gov
Tue Jul 27 17:57:01 EDT 2004


Hi Phil,

I have written a small micro-benchmark that simulates a scenario that you 
just described. (write a lot of information to a large number of files,
and a small amount of information to one special file, and then measure 
timings for 2 cases, a) when you call sync, b) when you call fsync on that
special file alone).
What one would expect is that the overall time to complete a sync() should 
be an order of magnitude higher than the times for doing just a fsync().

You can run the program like
./a.out -d /tmp/test/ -n 30
will create 30 files in /tmp/test and do a fsync() on one of them.
To time the sync() performance, you can run like
./a.out -d /tmp/test -n 30 -s.

(Creating 30 files will write roughly 90 MB of data to the file system)

Ok, now for the results of this test,
If I run it on my laptop's ext3 filesystem, the timings for fsync() and 
sync() are roughly of the same magnitude(!), somewhat confirming your 
hypothesis. 
However, I tried it on my machine at school that has reiserfs and the 
timings for fsync() is way smaller than the timings for sync(), which 
seems to indicate that the underlying file system seems to be responsible
in this particular scenario where a fsync seems to result in a full sync.

It would be interesting to see if anyone finds similar behavior on say 
jfs, reiser4, xfs and so on..
thanks,
Murali

> As a side note somewhat orthogonal to this conversation, I never did figure 
> out why calling fsync on a tiny db file clogged the whole server.  I would 
> assume this is dependent on the kernel and underlying fs.  It appeared that 
> an fsync call on any file, no matter how small, resulted in a full sync of 
> the file system.  So for example, if you had just written 100 MB out to disk 
> but hadn't synced, then the next fsync on a completely unrelated db file 
> would suddenly pay the cost of hitting disk for 100 MB.  It was pretty easy 
> to see this in action when importing large files.
> 
> If the fsyncs truly really worked as advertised and only caused the 4K 
> database file to be synced, it doesn't seem like these sync stalls would be 
> as painful.  If anyone has any insight into that issue (Is this known linux 
> behavior?  I am off base in expecting otherwise?) I would be curious to hear.
> 
> -Phil
> _______________________________________________
> PVFS2-developers mailing list
> PVFS2-developers at beowulf-underground.org
> http://www.beowulf-underground.org/mailman/listinfo/pvfs2-developers
> 
> 
-------------- next part --------------
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <fcntl.h>
#include <sys/types.h>
#include <errno.h>
#include <string.h>
#include <strings.h>

#define  NFILES	 512
#define  BASE_PATH "/tmp"

/* Per-file bookkeeping: one entry per test file created by open_files(). */
struct files {
	int fd;		/* open file descriptor returned by open() */
	int path_id;	/* random 0..1023 suffix used in the file's name */
};

/* Table of all opened files; allocated in open_files(), freed in close_files(). */
static struct files *all_fds = NULL;
static int nfiles, do_sync = 0;	/* file count (-n) and sync-vs-fsync flag (-s) */
static char *base_path = NULL;	/* directory to create test files in (-d) */

static int open_files(void)
{
	int i;

	all_fds = (struct files *) calloc(nfiles, sizeof(struct files));
	if (all_fds == NULL)
	{
		fprintf(stderr, "Could not allocate memory for all_fds: %s\n",
				strerror(errno));
		return -1;
	}
	for (i = 0; i < nfiles; i++)
	{
		char str[256];

		all_fds[i].path_id = rand() % 1024;
		snprintf(str, 256, "%s/%d_%3d", base_path, i, all_fds[i].path_id);
		all_fds[i].fd = open(str, O_RDWR | O_CREAT | O_TRUNC, 0700);
		if (all_fds[i].fd < 0)
		{
			fprintf(stderr, "Could not open %s: %s\n",
					str, strerror(errno));
			break;
		}
		//printf("[%d] -> (%s, %d)\n", i, str, all_fds[i].fd);
	}
	if (i != nfiles)
	{
		int j;

		for (j = 0; j < i; j++)
		{
			char str[256];
	
			close(all_fds[j].fd);
			snprintf(str, 256, "%s/%d_%3d", base_path, j, all_fds[j].path_id);
			unlink(str);
		}
		free(all_fds);
		return -1;
	}
	return 0;
}

/*
 * Close, unlink, and forget every file created by open_files().
 *
 * Idempotent: all_fds is reset to NULL after the free, and a NULL table
 * makes the call a no-op.  This matters because both do_fsync_tests()
 * and main() call close_files(); in the original the second call closed
 * stale descriptors and double-freed all_fds.
 *
 * Always returns 0.
 */
static int close_files(void)
{
	int i;

	if (all_fds == NULL)
	{
		return 0;	/* already cleaned up by an earlier call */
	}
	for (i = 0; i < nfiles; i++)
	{
		char str[256];

		close(all_fds[i].fd);
		/* name format must match the one used by open_files() */
		snprintf(str, sizeof(str), "%s/%d_%3d", base_path, i, all_fds[i].path_id);
		unlink(str);
	}
	free(all_fds);
	all_fds = NULL;	/* guard against a second close/free */
	return 0;
}

/*
 * Current wall-clock time in microseconds, returned as a double so the
 * callers can subtract two samples directly.
 */
static double Wtime(void)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (double) now.tv_sec * 1e06 + (double) now.tv_usec;
}

/*
 * Generate a lot of I/O by writing
 * to random portions of a majority of the files.
 * and try doing fsync's() on a small file
 * try and measure its performance.
 */
#define BSIZE 4096

/*
 * Dirty a lot of page cache by writing many 4 KB blocks to every file
 * except one randomly chosen "special" file, which receives a single
 * block.  Then time either a global sync() (-s) or an fsync() of just
 * the special file, and finally time closing/unlinking all files.
 *
 * Returns 0 on success, -1 if the fsync() fails.
 */
static int do_fsync_tests(void)
{
	int special_file_index, i;
	long total_data_written = 0;
	char *buf;
	double begin, end;

	special_file_index = rand() % nfiles;

	/* one reusable 4 KB block; malloc suffices since memset fills it
	 * (the original calloc'd and then memset the same bytes) */
	buf = malloc(BSIZE);
	assert(buf);
	memset(buf, rand() % 255, BSIZE);

	printf("Special file desc.= %d\n", all_fds[special_file_index].fd);
	for (i = 0; i < nfiles; i++)
	{
		int count, j;

		if (i != special_file_index)
		{
			/* write a lot of data for this file (512..1023 blocks) */
			count = rand() % 512 + 512;
		}
		else
		{
			/* write a little bit for this file alone */
			count = 1;
		}
		for (j = 0; j < count; j++)
		{
			/* a short write also indicates a problem (e.g. ENOSPC) */
			if (write(all_fds[i].fd, buf, BSIZE) != BSIZE)
			{
				fprintf(stderr, "write on %d failed: %s\n",
						all_fds[i].fd, strerror(errno));
				break;
			}
			/* Don't call fsync on any of them */
		}
		printf("Total data written[%d] = %g KB\n", 
				all_fds[i].fd, (count * BSIZE) / (1024.0));
		total_data_written += (count * BSIZE);
	}
	free(buf);	/* no longer needed; original leaked this on every path */
	buf = NULL;

	printf("Sum of all data written: %g MB\n",
			(total_data_written / (1024.0 * 1024)));
	/* now lets call fsync() and measure the time it took on the small file */
	if (do_sync)
	{
		printf("Calling sync()\n");
	}
	else
	{
		printf("Calling fsync on %d\n", all_fds[special_file_index].fd);
	}
	begin = Wtime();
	if (do_sync)
	{
		sync();
	}
	else
	{
		if (fsync(all_fds[special_file_index].fd) < 0)
		{
			fprintf(stderr, "fsync on %d failed: %s\n",
					all_fds[special_file_index].fd, strerror(errno));
			return -1;
		}
	}
	end = Wtime();
	/* Wtime() is in usec; * 1e-03 converts deltas to msec */
	printf("%s timed at %g msec (average: %g msec per file)\n",
			do_sync ? "sync" : "fsync", (end - begin) * 1e-03,
			do_sync ? (end - begin) * 1e-03/nfiles : (end - begin) * 1e-03);

	begin = Wtime();
	close_files();
	end = Wtime();
	printf("close of %d files timed at %g msec (avg: %g usec)\n",
			nfiles, (end - begin) * 1e-03, (end - begin) / nfiles);
	return 0;
}

/*
 * Print a brief command-line usage summary to stderr.
 * str: the program name (argv[0]).
 */
static void usage(char *str)
{
	/* original message lacked a space between "-n <...>" and "-d" */
	fprintf(stderr, "Usage: %s -n <number of files> "
			"-d <base directory> -s {use sync instead of fsync}\n", str);
	return;
}

/*
 * Parse options (-n count, -d directory, -s use sync() instead of
 * fsync(), -h help), create the files, run the timing test, and exit
 * non-zero on any failure.
 */
int main(int argc, char *argv[])
{
	int c = 0;

	srand(time(NULL));
	nfiles = NFILES;
	base_path = BASE_PATH;
	while ((c = getopt(argc, argv, "n:d:sh")) != EOF)
	{
		switch(c)
		{
			case 'n':
				nfiles = atoi(optarg);
				break;
			case 'd':
				base_path = optarg;
				break;
			case 's':
				do_sync = 1;
				break;
			case 'h':
			default:
				usage(argv[0]);
				exit(1);
		}
	}
	/* atoi() returns 0 on garbage; a non-positive count would make
	 * calloc(nfiles, ...) and the test loops meaningless */
	if (nfiles <= 0)
	{
		usage(argv[0]);
		exit(1);
	}
	if (open_files() < 0)
	{
		exit(1);
	}
	/* do_fsync_tests() closes and unlinks the files itself; the
	 * original called close_files() again here, double-freeing
	 * all_fds and re-closing stale descriptors */
	if (do_fsync_tests() < 0)
	{
		exit(1);
	}
	return 0;
}


More information about the PVFS2-developers mailing list