#!/usr/bin/perl -w # # =pod =head1 NAME backup.pl A script for building clusters. =head1 SYNOPSIS # Prepare host A for clustering cluster.pl --init=A # Refresh /usr/lib/backup cluster.pl --refresh=A # Make a backup of host A to host B cluster.pl --backup --source=A --destination=B # Create a local backup on /backup cluster.pl --backup --destination=/backup # In case: Copy /usr/lib/backup/A to / cluster.pl --install A =head1 DESCRIPTION This script keeps track of a Linux cluster. The general idea is as follows: There are certain files, which must not be mirrored, for example /etc/fstab (hardware dependent) or /etc/sysconfig/network- scripts/ifcfg-eth0 (fixes the current IP address on eth0) or /etc/rc.d/rc2.d/S64dhcpd (only one running DHCP server instance at the same time). To assure this, we create the following directory tree below F: A -- etc -- fstab.ignore | +- sysconfig -- network-scripts -- ifcfg-eth0 | +- rc.d -- rc2.d -- S65dhcpd | B -- etc -- fstab.ignore +- sysconfig -- network-scripts -- ifcfg-eth0 +- rc.d -- rc2.d -- S65dhcpd If host A is dropping service, we start host B by copying the files from F to F (the exception being files called F<*.ignore>) In theory host B should run as host A after an immediate reboot ... by ignoring trivial problems like boot sectors and all these kind of ugly things. Now for a more formal description (something completely different :-). We assume the following: =over 8 =item - There's a source machine A which ought to be left untouched. =item - There's a backup machine B, an almost identical copy of A, the exception being the IP address etc. The idea is that both machines ought to stay online at the same time. =item - If a file must not be mirrored, then we create a copy below F. For example, a copy of F will be stored in FmachineE/etc/hosts>. If the copy is called FmaschineE/etc/fstab.ignore> rather than FmaschineE/etc/fstab>, then the file will never be copied back. The script is responsible for refreshing these copies and synchronizing the machines. (It requires root permissions via ssh or ssh2 on both machines.) =item - Doing a backup means mirroring all files, the exception being those from F. Installing a backup means copying the files from F to F. =item - Whenever the backup machine shall replace the original, then we install the files from F to F and do a reboot. Host A needs to become powered off or we install the files from F to F and do a reboot, so that A becomes the backup server B. =cut use strict; use Getopt::Long (); use Socket (); use File::Basename (); use File::Path (); use File::Find (); use IO::File (); use IO::AtomicFile (); use File::Spec (); ############################################################################ # # Globale Variablen # ############################################################################ use vars qw($debug $verbose $BASEDIR $MY_NAME $SSH @INSTALLFILES $VERSION); $VERSION = "0.01"; $debug = 0; $verbose = 0; $BASEDIR = "/usr/lib/backup"; $MY_NAME = "mail"; # { # my $hostname = `hostname`; # if (defined($hostname) && $hostname =~ /sun2/) { # $MY_NAME = "sun2.zvw.de"; # } # } $SSH = "/usr/bin/ssh"; @INSTALLFILES = # # H-Net.com, RH Linux 5.2 # # qw(/etc/hosts /etc/sysconfig/network /etc/HOSTNAME # /etc/sysconfig/network-scripts/ifcfg-eth0 # /etc/resolv.conf /etc/conf.modules /etc/fstab # /root/.ssh/identity /root/.ssh/identity.pub # /etc/ssh/ssh_host_key /etc/ssh/ssh_host_key.pub # /etc/lilo.conf /etc/mtab /etc/rc.d/rc2.d/S65dhcpd # /etc/rc.d/rc3.d/S65dhcpd /etc/rc.d/rc4.d/S65dhcpd # /etc/rc.d/rc5.d/S65dhcpd # /etc/rc.d/rc3.d/S55named /etc/rc.d/rc4.d/S55named # /etc/rc.d/rc5.d/S55named); # # ZVW, Sparc Solaris 5.6 # # qw(/etc/inet/hosts /etc/net/ticlts/hosts # /etc/net/ticots/hosts /etc/net/ticotsord/hosts # /etc/hostname.le0 /etc/nodename /kernel/genunix # /root/.ssh/identity # /root/.ssh/identity.pub /root/.ssh/known_hosts # /root/.ssh/.ssh/authorized_keys /etc/ssh/ssh_host_key # /etc/ssh/ssh_host_key.pub /etc/vfstab); # # bgag.de, RH Linux 6.0, just a local backup # qw(/etc/fstab /etc/lilo.conf /etc/mtab); =head2 Preparing a machine for integration into the cluster. To integrate a machine into the cluster, we create the directory FhostE/>. This happens by executing cluster.pl --init= The list of files being created below FhostE/> is read from the global variable @INSTALLFILES. You need to adjust this variable before creating the cluster to your local settings. =cut sub Clone { my($source, $dest) = @_; my $dir = File::Basename::dirname($dest); unless (-d $dir) { print "Creating directory $dir.\n" if $verbose; File::Path::mkpath([$dir], 0, 0700) unless $debug; } print "Cloning $source to $dest.\n" if $verbose; return if $debug; my($dev, $ino, $mode, $nlink, $uid, $gid) = stat $source or die "Cannot stat $source: $!"; my $fhin = IO::File->new($source, "r") or die "Failed to open $source for input: $!"; my $fhout; $fhout = IO::AtomicFile->open($dest, "w") or die "Failed to open $dest for output: $!"; while (1) { my $line; my $result = $fhin->read($line, 4096); die "Failed to read $source: $!" unless defined($line); last unless $result; $fhout->print($line) or die "Failed to write to $dest: $!"; } die "Failed to close $dest: $!" unless $fhout->close(); chmod $mode, $dest or warn "Failed to change mode of $dest to $mode: $!"; chown $uid, $gid, $dest or warn "Failed to change ownership of $dest to $uid, $gid: $!"; } sub Init { my $o = shift; my $host = $o->{'init'} or die "Init: Missing host name"; die "Init: Cannot resolve host name $host" unless Socket::inet_aton($host); my $dir = File::Spec->catdir($BASEDIR, $host); die "Cannot init host $host: $dir already exists" if -e $dir; print "Creating directory $dir, mode 0700\n" if $verbose; File::Path::mkpath([$dir], 0, 0700) unless $debug; foreach my $file (@INSTALLFILES) { Clone($file, File::Spec->catfile($dir, $file)) if -f $file; } } =pod =head2 Refreshing the contents of /usr/lib/backup From time to time (I do it once a day) you need to refresh the contents of F This is happening by running cluster.pl --refresh=EhostE If EhostE is an external machine, then the refresh is done via C or C and C. You can fix the ssh path in the global variable $SSH. =cut sub Refresh { my $o = shift; my $host = $o->{'refresh'} or die "Init: Missing host name"; if ($host ne $MY_NAME) { my @command = ($SSH, $host, "perl", "$BASEDIR/cluster.pl", "--refresh=$host"); print "Refreshing remote directory: ", join(" ", @command), "\n" if $verbose; system @command unless $debug; my $v = $verbose ? "v" : ""; @command = ("rsync", "-az$v", "-e", $SSH, "$host:$BASEDIR/", "$BASEDIR/"); print "Refreshing local directory: ", join(" ", @command), "\n" if $verbose; system @command unless $debug; } else { my $dir = File::Spec->catdir($BASEDIR, $host); die "No such host in $dir: $host" unless -d $dir; my $wanted = sub { return if -d $_; my $t = $File::Find::name; my $f = $t; $f =~ s/\.ignore$//; if ($f =~ /^$BASEDIR\/.*?(\/.*)/) { $f = $1; } else { die "Cannot parse file name: $f"; } my $v = $verbose ? "v" : ""; my @command = ("rsync", "-a$v", $f, $t); print "Refreshing $t from $f: ", join(" ", @command), "\n" if $verbose; system @command unless $debug; }; File::Find::find($wanted, $dir); } } =pod =head2 Creating a backup A backup is created by running cluster.pl --backup --source=A --destination=B This synchronizes the contents of source host A with the destination host B. The cluster script distinguishes between a local backup (for example, if a machine has two disk drives, one for backup purposes) with cluster.pl --backup --source=A --destination=A/backup and a remote backup, where the host name differs in source and destination. In both cases the external command C is used for creating the backup and a list of exception files is created by looking into F. You are strongly encouraged to unmount a local backup partition by default. That means that the backup partition is using the I flag in F, for example as follows: /dev/hdb2 /backup ext2 noauto 0 0 In other words: We don't mount the partition at boot time. Instead the partition is mounted and unmounted manually, for example by calling the following script from within the crontab: #!/bin/sh mount /backup /usr/lib/backup/cluster.pl --backup --source=A --destination=A/backup umount /backup =cut sub Backup { my $o = shift; my $source = $o->{'source'} or die "Missing source host"; die "Cannot resolv host name: $source" unless Socket::inet_aton($source); my $sdir = File::Spec->catdir($BASEDIR, $source); die "No such host in $BASEDIR: $source" unless -d $sdir; my $dest = $o->{'destination'} or die "Missing destination host"; my $destdir = "/"; if ($dest =~ /(.*?)(\/.*)/) { $destdir = $2; $dest = $1; if ($dest ne $source) { die "Cannot perform a local backup between different hosts.\n"; } } die "Cannot resolv host name: $dest" unless Socket::inet_aton($dest); my $ddir = File::Spec->catdir($BASEDIR, $dest); die "No such host in $BASEDIR: $dest" unless -d $ddir; # Create a list of exception files. my %files; foreach my $d (qw(/proc /mnt/cdrom /mnt/floppy /backup)) { if (-d $d) { $files{"$d/*"} = 1; } } if ($source ne $MY_NAME || $dest ne $MY_NAME) { $files{"$BASEDIR/*"} = 1; } $files{"$destdir/*"} = 1 if $destdir ne "/"; my $wanted = sub { return if $_ =~ /\.ignore$/ || $_ =~ /^\.\.?$/ || -d $_; my $f = $File::Find::name; if ($f =~ /^$BASEDIR\/.*?(\/.*)/) { $files{$1} = 1; } else { die "Cannot parse file name: $f"; } }; File::Find::find($wanted, ($sdir eq $ddir) ? ($sdir) : ($sdir, $ddir)); my $v = $verbose ? "v" : ""; my @opts = ("rsync", "--delete", "-e", $SSH, ($source ne $MY_NAME || $dest ne $MY_NAME) ? "-az$v" : "-a$v", ($source ne $MY_NAME) ? "$source:/" : "/", ($dest ne $MY_NAME) ? "$dest:$destdir" : $destdir, map { ("--exclude", $_) } keys %files); print "Backup command: ", join(" ", @opts), "\n" if $verbose; system @opts unless $debug; if ($destdir eq "/") { print "Updating boot blocks: /sbin/lilo\n" if $verbose; system "/sbin/lilo" unless $debug; } } =pod =head2 Restoring a backup This is the worst case. Hope it never happens, and we have a good chance it never happens, now that we have a backup ... ;-) The backup is restored by running cluster.pl --install=EhostE followed by a reboot. Even by having a backup, success isn't guaranteed. Rebooting can fail for a number of reasons, for example the former F can become F (because the first drive doesn't become detected) or booting fails. I strongly advise you to have boot disks for both cases or one disk with an appropriate lilo boot block. If all else fails, a rescue disk can be of great help. With Red Hat Linux, you create a rescue disk like this: mount /mnt/cdrom cd /mnt/cdrom/images dd if=boot.img of=/dev/fd0 dd if=rescue.img of=/dev/fd0 Of course you can create a rescue disk from within DOS or Windows by using the F utility. =cut sub Install { my $o = shift; my $host = $o->{'install'}; my $dir = File::Spec->catdir($BASEDIR, $host); die "Failed to install $host: No such directory in $BASEDIR" unless -d $dir; die "Failed to install $host: This is me" if $host eq $MY_NAME; my $odir = File::Spec->catdir($BASEDIR, $MY_NAME); my %files; my $wanted = sub { return if $_ =~ /\.ignore$/ || $_ =~ /^\.\.?$/ || -d $_; my $f = $File::Find::name; return if -f "$f.ignore"; if ($f =~ /^$BASEDIR\/.*?(\/.*)/) { my $t = $1; $files{$f} = $t; } else { die "Cannot parse file name: $f"; } }; File::Find::find($wanted, $dir); print "The following files will be replaced, if we continue:\n\n"; while(my($var, $val) = each %files) { print " $val\n"; } print "\nDo you really want to continue? [n] \n"; my $reply = ; exit 1 unless $reply =~ /y/i; print "\nSure? [n] \n"; $reply = ; exit 1 unless $reply =~ /y/i; while(my($var, $val) = each %files) { Clone($var, $val); } } ############################################################################ # # Name: Usage # # Purpose: Print Usage message and exit # ############################################################################ sub Usage { print <<"EOF"; Usage: $0 [options] Possible actions are: --backup Create a backup from one machine to another machine. The options --source and --destinations have to be used for setting source and destination hosts. --init= Initialize the directory /usr/lib/backup/ for using in the cluster. --refresh= Refresh the contents of /usr/lib/backup/. --install= Install the original files from /usr/lib/backup/ into the current machine. Possible options are: --destination= Use as the destination machine in a backup. --source= Use as the source machine in a backup. --ssh= Use as ssh replacement (Default: /usr/bin/ssh1) See "perldoc $0" for details cluster.pl $VERSION, (C) 1999 by Jochen Wiedmann This script can be distributed and used under either the Artistic License or the GPL (GNU General Public License), as specified in the Perl README. In particular the author refuses any warranty. You are using this software at your own risk! EOF exit 1; } ############################################################################ # # This is main() # ############################################################################ { my %o; Getopt::Long::GetOptions(\%o, 'backup', 'debug', 'destination=s', 'init=s', 'install=s', 'help', 'refresh=s', 'source=s', 'ssh=s', 'verbose'); $verbose = 1 if $o{'verbose'} or $o{'debug'}; $debug = 1 if $o{'debug'}; $SSH = $o{'ssh'} if $o{'ssh'}; if ($< || $>) { die "Must be running as root"; } if ($o{'backup'}) { Backup(\%o); } elsif ($o{'init'}) { Init(\%o); } elsif ($o{'install'}) { Install(\%o); } elsif ($o{'refresh'}) { Refresh(\%o); } else { Usage(); } } 1; =pod =head1 EXAMPLE We run the following script daily at 4 am on host A: #!/bin/sh cd /usr/lib/backup ./cluster.pl --refresh=A ./cluster.pl --refresh=B ./cluster.pl --backup --source=A --destination=B mount /backup ./cluster.pl --backup --source=A --destination=A/backup umount /backup =head1 CPAN This script is available from any CPAN mirror, in particular ftp://ftp.funet.fi/pub/languages/perl/CPAN/authors/id/JWIED The following sections are merely for CPAN's automatic registration scheme: =head2 SCRIPT CATEGORIES Unix:System_administration =head2 PREREQUISITES The ssh (or ssh2) and rsync utilities. Additionally you need the Perl module B. =head2 README The cluster.pl script is a small script for maintaining a Linux cluster, to be precise, a cluster of machines with one original host and multiple backup hosts. =head1 AUTHOR Jochen Wiedmann Am Eisteich 9 72555 Metzingen Germany Email: joe@ispsoft.de This script can be distributed and used under either the Artistic License or the GPL (GNU General Public License), as specified in the Perl README. In particular the author refuses any warranty. You are using this software at your own risk! =head1 SEE ALSO L, L =cut