Wesley @BerryLab

Tuesday, August 26, 2008

[LINUX] Gentoo Tips

Resolving a system package that blocks another package

[blocks B] pkg1 (is blocking pkg2)

$ emerge --buildpkgonly --nodeps pkg2 && emerge -C pkg1 && emerge --usepkgonly pkg2

from

傻蛋碎碎唸: Gentoo emerge 套件發現 blocking 時的安全解法: "emerge --buildpkgonly --nodeps util-linux && emerge -C setarch && emerge --usepkgonly util-linux"

Monday, August 25, 2008

[SYSLOG] manage system logging

my syslog-ng.conf

# Syslog-ng default configuration file for Gentoo Linux
# contributed by Michael Sterrett

options {
        chain_hostnames(off);
        sync(0);

        # The default action of syslog-ng 1.6.0 is to log a STATS line
        # to the file every 10 minutes.  That's pretty ugly after a while.
        # Change it to every 12 hours so you get a nice daily update of
        # how many messages syslog-ng missed (0).
        stats(43200);
};

source src {
    unix-stream("/dev/log" max-connections(256));
    internal();
    file("/proc/kmsg");
};

# mcelog
filter mcelog { program(mcelog); };
destination mcelog{ file("/var/log/mcelog"); };
log { source(src); filter(mcelog); destination(mcelog); };

# dhcpd
filter dhcpd { program("dhcpd"); };
destination dhcpd { file("/var/log/dhcpd"); };
log { source(src); filter(dhcpd); destination(dhcpd); };

# postfix
#filter postfix { facility(mail) or match("postfix"); };
filter mail { facility(mail); };
destination mail { file("/var/log/maillog"); };
log { source(src); filter(mail); destination(mail); };

# cron
filter cron { facility(cron); };
destination cron { file("/var/log/cron"); };
log { source(src); filter(cron); destination(cron); };

# sshd, su
#filter secure { facility(auth,authpriv) or program("sshd"); };
filter secure { facility(auth,authpriv); };
destination secure { file("/var/log/secure"); };
log { source(src); filter(secure); destination(secure); };

# syslog-ng
filter syslog { facility(syslog); };
destination syslog { file("/var/log/syslog"); };
log { source(src); filter(syslog); destination(syslog); };

# message log
filter messages { not filter(cron) and not filter(mail)
        and not filter(dhcpd) and not filter(secure) and not filter(syslog); };
destination messages { file("/var/log/messages"); };
log { source(src); filter(messages); destination(messages); };

# By default messages are logged to tty12...
destination console_all { file("/dev/tty12"); };
# ...if you intend to use /dev/console for programs like xconsole
# you can comment out the destination line above that references /dev/tty12
# and uncomment the line below.
#destination console_all { file("/dev/console"); };
log { source(src); destination(console_all); };

iptables log to ulog

iptables: Getting full packets out of the kernel

syslog-ng howto

syslog-ng log server安裝說明

syslog-ng的特異功能
經常check system event一直是系統管理上很容易被忽略的重要細節，原因在於大部分的system logs傳達的資訊是不重要的，以至於將重要的警訊淹沒。
如同syslog-ng的名稱，它是用以取代syslogd的next generation版本，原來的syslog是只能夠依priority及facility作分類，syslog-ng可以根據log的內容，以 regular expression自訂分類及log的處理方式，並且支援以tcp/udp將log送到遠端的server，或是即時通知在線上的系統管理者，甚至能將 log值當成某個program的標準輸入字串，直接將log作加工及分析。

syslog-ng的應用與範例
Syslog-ng的message path全部在syslog-ng.conf作編輯。在syslog-ng的message path中，可以包括多個sources、多個filter rules及多個destinations的同步多工處理。一般來說，採用syslog-ng最大的用途除了運用它分類的功能外，就是運用它的網路功能。如前所提，syslog-ng可以透過tcp或是udp的協定將log送到遠端的server作集中式的監控。
現在我們先假設一個情境，我們希望將幾台提供網路服務的server的log集中到一台server作管理，我們希望能夠將log值中含有特定 facility code或特定priority的log儲存在本地端，並且將所有收集到的log值全部送到log server作分類以及儲存。原來的syslog做不到這麼多點，所以我們必須在每一台server安裝syslog-ng，各透過tcp送到log server，log server的syslog-ng.conf依server.conf的範例設定。詳細的syslog-ng config參數當然不只這些，實際設計系統時最好直接看看官方網站的reference manual (http://www.balabit.hu/static/syslog-ng/ ... book1.html) 。


server.conf

#設定options,語法為options { S1 ; S2 ; S3…};
#options主要是對整個configuration的共通設定

options { use_fqdn(yes); keep_hostname(yes); use_dns(yes); long_hostnames(off); sync(3); log_fifo_size(300); };

#設定input來源,語法為source source_name {S1 ; S2 ; …};
#internal()表所有本機產生的log
# unix-stream("/dev/log")表來自本機的log檔,本機是Linux,若為BSD則需用
#unix-dgram，這是開啟一個AF_UNIX socket聽取目的檔案的message.
#收集本地端以及來自192.168.0.100 port 514這台server送來的log值,
#keep-alive(yes/no)決定是否保持連線直到收到sighup訊號

source src { unix-stream("/dev/log"); internal(); };
source remote { tcp(ip("127.0.0.100") port(514) keep-alive(yes)); };

#設定output目的地,語法為destination destination_name {S1 ; S2 ; …};
#file("path"):以檔案的方式存在local端
#usertty("user_name"):即時通知特定的線上的使用者

destination lpr { file("/var/log/lpr.log"); };
destination mail { file("/var/log/mail.log"); };
destination messages { file("/var/log/messages"); };
destination console { usertty("root"); };

#設定filter條件,filter filter_name{expression;}; expression為各條件以and or not
#連結
#facility(string1,string2):篩選出facility code為string1或string2其中之一的log.
#level(S1..S2..S3) or priority(S1..S2..S3),篩選出包含其中之一level的log

filter f_lpr { facility(lpr); };
filter f_mail { facility(mail); };
filter f_messages { level(info..emerg) and not facility(mail,lpr); };
filter f_emergency { level(emerg); };

###############################################################
# 將設定好的source,filter,destination依需求作組合

log { source(src); filter(f_lpr); destination(lpr); };
log { source(src); filter(f_mail); destination(mail); };
log { source(src); filter(f_messages); destination(messages); };
log { source(src); filter(f_emergency); destination(console); };

###############################################################
#將收到的所有log依host分資料夾,再依該log的facility code分別儲存，
#並設定owner, group, permission, directory permission

destination hosts { file("/var/log/HOSTS/$HOST/$FACILITY" owner(root) group(root) perm(0600) dir_perm(0700) create_dirs(yes)); };

log { source(remote); destination(hosts); };
###############################################################
另外，我們必須將/etc/logrotate.d/syslog置換成以下syslog-ng:

/var/log/lpr.log {
postrotate
/bin/kill -HUP `cat /var/run/syslogd-ng.pid 2> /dev/null` 2> /dev/null ||
true
endscript
}

/var/log/mail.log {
postrotate
/bin/kill -HUP `cat /var/run/syslogd-ng.pid 2> /dev/null` 2> /dev/null ||
true
endscript
}

/var/log/messages {
postrotate
/bin/kill -HUP `cat /var/run/syslogd-ng.pid 2> /dev/null` 2> /dev/null ||
true
endscript
}

/var/log/lastlog {
postrotate
/bin/kill -HUP `cat /var/run/syslogd-ng.pid 2> /dev/null` 2> /dev/null ||
true
endscript
}
最後啟動syslog-ng！
#/etc/rc.d/init.d/syslog-ng start

結合database
(1)如果我們希望將log值存進資料庫的話，我們必須先將database server建起來(這裡我們選擇mysql)，並且必須產生一個syslog.sql以建立資料庫及資料表:
CREATE DATABASE syslog;

USE syslog;

CREATE TABLE logs (
host varchar(32) default NULL,
facility varchar(10) default NULL,
priority varchar(10) default NULL,
level varchar(10) default NULL,
tag varchar(10) default NULL,
date date default NULL,
time time default NULL,
program varchar(15) default NULL,
msg text,
seq int(10) unsigned NOT NULL auto_increment,
PRIMARY KEY (seq),
KEY host (host),
KEY seq (seq),
KEY program (program),
KEY time (time),
KEY date (date),
KEY priority (priority),
KEY facility (facility)
) TYPE=MyISAM;
然後執行mysql -u root -p < syslog.sql。
(2) 執行mkfifo /tmp/mysql.pipe
(3) 執行mysql -u root --password=passwd syslog < /tmp/mysql.pipe
(4) 最後，我們必須在syslog-ng.conf增加一條command:
destination d_mysql {
pipe("/tmp/mysql.pipe"
template("INSERT INTO logs (host, facility, priority, level, tag, date, time, program, msg) VALUES ( '$HOST', '$FACILITY', '$PRIORITY', '$LEVEL', '$TAG', '$YEAR-$MONTH-$DAY', '$HOUR:$MIN:$SEC', '$PROGRAM', '$MSG' );\n") template-escape(yes));
};
log { source(net); destination(d_mysql);};
(5) 重開syslog-ng
/etc/init.d/syslog-ng stop # Stop syslog-ng
/etc/init.d/syslog-ng start # Start syslog-ng

syslog(d) howto

syslog Overview

The standard UNIX syslog facilities are

    kern – kernel
    user – application or user processes (this is the default if the application sending a message does not specify the facility)
    mail/news/UUCP/cron – electronic mail/NNTP/UUCP/cron subsystems
    daemon – system daemons
    auth – authentication and authorization related commands
    lpr – line printer spooling subsystem
    mark – inserts timestamp into log data at regular intervals
    local0-local7 – 8 facilities for customized auditing
    syslog – internal messages generated by syslog itself
    authpriv – non-system authorization messages
    * -- on most versions of UNIX, refers to all facilities except mark

syslog levels are nominally defined as:

    emerg – system is or will be unusable if situation is not resolved
    alert – immediate action required
    crit – critical situations
    err – error conditions
    warning – recoverable errors
    notice – unusual situation that merits investigation; a significant event that is typically part of normal day-to-day operation
    info – informational messages
    debug – verbose data for debugging

Saturday, August 23, 2008

[BENCHMARK] Memory Test

memtester

$ memtester <MEMORY> [ITERATIONS]

MEMORY the amount of memory to allocate and test, in megabytes.
ITERATIONS (optional) number of loops to iterate through. Default is infinite.

Example == Linux: How do I find out causes for memory faults?

Tuesday, August 19, 2008

[SYSTEM] Memory Error on Tyan S2882 with AMD Opteron 248 + 8x 1GB Corsair ECC DDR400

memtester result with ECC disabled

memtester version 4.0.8 (64-bit)
Copyright (C) 2007 Charles Cazabon.
Licensed under the GNU General Public License version 2 (only).

pagesize is 4096
pagesizemask is 0xfffffffffffff000
want 7000MB (7340032000 bytes)
got  7000MB (7340032000 bytes), trying mlock ...locked.
Loop 1:
  Stuck Address       : ok
  Random Value        : ok
  Compare XOR         : ok
  Compare SUB         : ok
  Compare MUL         : ok
  Compare DIV         : ok
  Compare OR          : ok
  Compare AND         : ok
  Sequential Increment: ok
  Solid Bits          : ok
  Block Sequential    : testing 123FAILURE: 0x7b7b7b7b7b7b7b7b != 0x7a7b7b7b7b7b7b7b at offset 0x062b25e3.
  Checkerboard        : ok
  ....

on loading a large amount of memory

EDAC k8 MC1: general bus error: participating processor(local node response), time-out(no timeout) memory transaction type(generic read), mem or i/o(mem access), cache level(generic)
EDAC MC0: CE page 0xfc5b, offset 0x7d0, grain 8, syndrome 0xf654, row 2, channel 1, label "": k8_edac
EDAC k8 MC1: extended error code: ECC chipkill x4 error
EDAC k8 MC0: general bus error: participating processor(local node response), time-out(no timeout) memory transaction type(generic read), mem or i/o(mem access), cache level(generic)
EDAC MC0: CE page 0x1cfc16, offset 0x6c0, grain 8, syndrome 0x4472, row 0, channel 1, label "": k8_edac
EDAC k8 MC0: extended error code: ECC chipkill x4 error
EDAC k8 MC1: general bus error: participating processor(local node response), time-out(no timeout) memory transaction type(generic read), mem or i/o(mem access), cache level(generic)
EDAC MC0: CE page 0xea59, offset 0x830, grain 8, syndrome 0xf654, row 2, channel 1, label "": k8_edac
EDAC k8 MC1: extended error code: ECC chipkill x4 error

identify the DIMM

Re: EDAC chipkill messages
Machine Check Exception

machine check events

what does it mean ?

to show the contents

$ /usr/sbin/mcelog

[x86_64] how worried should I be about MCEs?

EDAC options in kernel and bios

EDAC Project
Chipkill Advanced ECC - Overview of How It Works
Speed vs. Precision
BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD Opteron Processors

corrected ecc error

   --------------------     
  | An Overview of ECC | 
   --------------------
                              
    Introduction
    ------------
      The scope of this discussion is limited to soft and hard errors that
      occur in memory and how they are reported by Solaris.  It does not
      account for errors that occur while data travels through the E10000
      interconnect, CPU Module, or I/O.  For this discussion, soft errors 
      are transient or temporary errors in memory that can be corrected by
      rewriting the affected memory cell.  Hard errors occur when a cell
      is permanently damaged and cannot hold the correct information. With
      a hard error, the cell can be permanently stuck-at "0", or "1".

    ECC Concepts
    ------------
      Any volatile storage medium, whether it be the Dynamic Random Access
      Memory (DRAM) used on main memory DIMMs or Static Random Access Memory
      (SRAM) mainly used for caches, is subject to occasional natural
      incidences of data loss due to the impact of alpha particles or cosmic
      rays. This data loss manifests itself in the changing of the value
      stored in the memory cell affected by the collision.  Typically only a
      single bit is affected, but there is a small probability that multiple
      cells can be upset.

      When a bit flips due to this phenomenon, it is referred to as a soft
      error.  This is to distinguish it from a hard error resulting from a
      hardware failure.  These soft errors happen at a rate, called the soft
      error rate (SER), that can be predicted as a function of the memory
      density, the memory technology, and the altitude of the system in which
      the memory resides.

      ECC was invented to allow survival from these naturally occurring
      losses of data.  The ECC method used on the E10000 is called a Single
      Error Correcting, Double Error Detecting code (SEC-DED).  The concept is
      that every word of data is written to memory along with a number of
      extra check bits.  When the word is read back from memory, a fresh set
      of check bits is recomputed and compared with the check bits that were
      stored in memory.  The result of this comparison is called the syndrome.

      If the syndrome is zero, the comparison was identical, and thus the
      data is good.  A non-zero syndrome means the data is in error, and the
      syndrome is used to find a single bit in error and correct it.  A
      single bit error is called a Correctable Error (CE).  The syndrome can
      also detect if two bits are in error, but it does not have enough
      information to identify which two bits.  This type of error is called
      an Uncorrectable Error (UE).  UltraSPARC microprocessors use a SEC-DED
      variant called S4ED that also can detect, but not correct, three or
      four bit errors if they are clustered within a four bit nibble.

Sunday, July 27, 2008

[CVS] note for CVS

using CVS over different SSH port

in .ssh/config

Host SomeCvsServer
HostKeyAlias cvs.some.server.com
Hostname cvs.some.server.com
Port 12345

cvs usage

$ export CVS_RSH=ssh
$ export CVSROOT=":ext:user@SomeCvsServer:/cvsroot"
$ cvs co project

Wednesday, July 23, 2008

[Q-CHEM] Build Q-Chem

add a remkey without rebuilding the whole of Q-Chem

$ rm $QC/config/rem.conf
$ qcmake rem_config rem rem_input
$ rm -f $QC/qparser/*.o $QC/libgen*.o
$ qcmake qparser.a libgen.a
$ rm -f $QC/<probably program>/*.o
$ qcmake qcprogall

local copy of qcaux

As suggested in README.QCAUX: "It is recommended to store qcaux locally, especially drivers, as loading appears to be pretty intensive and puts a heavy tax on NFS, and may stall Q-CHEM when network is flaky."

[SGE] Parallel Environment

allocation rule

<integer>          allocate exactly this many slots per host 
$pe_slots          allocate as many slots on single host as stated on command line: qsub -pe <pe name> <slot range>
$fill_up           fill up one host, move to another, continue until range filled 
$round_robin       do round-robin allocation over all suitable hosts until range filled