Friday, November 27, 2009

PostgreSQL กับการกำหนดค่าที่น่าสนใจของ WEB, OLPT, DW

Well, that doesn't help unless we either provide a .conf generation tool (something I favor) or docs somewhere which explain which are the variables to be the most concerned with instead of making users read through all 218 of them.
Attached is the postgresql.conf.simple I used in my presentaiton. It has an egregious math error in it (see if you can find it) but should give you the general idea.
--Josh

# ----------------------------------------
# Simple PostgreSQL Configuration File
# ----------------------------------------

# This file provides a simple configuration with the most common options
# which most users need to modify for running PostgreSQL in production, 
# including extensive notes on how to set each of these.  If your configuration
# needs are more specific, then use the standard postgresql.conf, or add 
# additional configuration options to the bottom of this file.
#
# This file is re-read when you send a SIGHUP to the server, or on a full
# restart.  Note that on a SIGHUP simply recommenting the settings is not
# enough to reset to default value; the last explicit value you set will
# still be in effect.
#
# AvRAM:  Several of the formulas below ask for "AvRAM", which is short for
# "Available RAM".  This refers to the amount of memory which is available for
# running PostgreSQL.  On a dedicated PostgreSQL server, you can use the total
# system RAM, but on shared servers you need to estimate what portion of RAM
# is usually available for PostgreSQL.
#
# Each setting below lists one recommended starting setting, followed by
# several alternate settings which are commented out.  If multiple settings
# are uncommented, the *last* one will take effect.

# listen_addresses
# ------------------------
# listen_addresses takes a list of network interfaces the Postmaster will
# listen on.  The setting below, '*', listens on all interfaces, and is only
# appropriate for development servers and initial setup.  Otherwise, it 
# should be restrictively set to only specific addresses. Note that most
# PostgreSQL access control settings are in the pg_hba.conf file.

  listen_addresses = '*' # all interfaces 
# listen_addresses = 'localhost'  # unix sockets and loopback only
# listen_addresses = 'localhost,192.168.1.1' # local and one external interface

# max_connections
# ------------------------
# An integer setting a limit on the number of new connection processes which 
# PostgreSQL will create.  Should be set to the maximum number of connections 
# which you expect to need at peak load.  Note that each connection uses
# shared_buffer memory, as well as additional non-shared memory, so be careful
# not to run the system out of memory.  In general, if you need more than 1000
# connections, you should probably be making more use of connection pooling.
# 
# Note that by default 3 connections are reserved for autovacuum and 
# administration.

  max_connections = 200  # small server
# max_connections = 700  # web application database
# max_connections = 40   # data warehousing database

# shared_buffers
# ------------------------
# A memory quantity defining PostgreSQL's "dedicated" RAM, which is used
# for connection control, active operations, and more.  However, since
# PostgreSQL also needs free RAM for file system buffers, sorts and 
# maintenance operations, it is not advisable to set shared_buffers to a
# majority of RAM.  
#
# Note that increasing shared_buffers often requires you to increase some 
# system kernel parameters, most notably SHMMAX and SHMALL.  See 
# Operating System Environment: Managing Kernel Resources in the PostgreSQL
# documentation for more details.  Also note that shared_buffers over 2GB is 
# only supported on 64-bit systems.
#
# The setting below is a formula.  Calculate the resulting value, then
# uncomment it.  Values should be expressed in kB, MB or GB.

# shared_buffers = ( AvRAM / 4 )
# shared_buffers = 512MB   # basic 2GB web server
# shared_buffers = 8GB     # 64-bit server with 32GB RAM

# work_mem
# ------------------------
# This memory quantity sets the limit for the amount of non-shared RAM 
# available for each query operation, including sorts and hashes.  This limit
# acts as a primitive resource control, preventing the server from going
# into swap due to overallocation.  Note that this is non-shared RAM per
# *operation*, which means large complex queries can use multple times
# this amount.  Also, work_mem is allocated by powers of two, so round
# to the nearest binary step.

# The setting below is a formula.  Calculate the resulting value, then                                                                                                              
# uncomment it.  Values should be expressed in kB, MB or GB.  Maximum
# is currently 2GB.

# Most web applications should use the formula below, because their 
# queries often require no work_mem. 
# work_mem = ( AvRAM / max_connections ) ROUND DOWN to 2^x
# work_mem = 2MB  # for 2GB server with 700 connections  

# Formula for most BI/DW applications, or others running many complex
# queries:
# work_mem = ( AvRAM / ( 2 * max_connections ) ) ROUND DOWN to 2^x
# work_mem = 128MB   # DW server with 32GB RAM and 40 connections 

# maintenance_work_mem
# -------------------------
# This memory value sets the limit for the amount that autovacuum, 
# manual vacuum, bulk index build and other maintenance routines are 
# permitted to use.  Setting it to a moderately high value will increase
# the efficiency of vacuum and other operations.

# The setting below is a formula.  Calculate the resulting value, then                                                                                                                   
# uncomment it.  Values should be expressed in kB, MB or GB.  
# Maximum is currently 2GB.                                                                                                                           

# Formula for most databases
# maintenance_work_mem = ( AvRAM / 8 ) ROUND DOWN to 2^x
# maintenance_work_mem = 256MB  #webserver with 2GB RAM
# maintenance_work_mem = 2GB  #DW server with 32GB RAM

# max_fsm_pages
# --------------------------
# An integer which sets the maximum number of data pages with free space 
# which the Postmaster will track.  Setting this too low can lead to 
# table bloat and need for VACUUM FULL.  Should be set to the maximum number
# of data pages you expect to be updated between vacuums. 
#
# Increasing this setting requires dedicated RAM and like shared_buffers
# may require to to increase system kernel parameters.  Additionally, the
# recommended setting below is based on the default autovacuum settings;
# if you change the autovacuum parameters, then you may need to adjust
# this setting to match.

# The setting below is a formula.  Calculate the resulting value, then                                                                                                                   
# uncomment it.  DBsize is your estimate of the maximum size of the database;
# if the database is already loaded, you can get his from pg_database_size().
# For large data warehouses, use the volume of data which changes between 
# batch loads as your "DBSize"

# For small databases ( less than 10GB )
# max_fsm_pages = ( ( DBsize / 8kB ) / 8 )
# max_fsm_pages = 100000  #6GB web database 

# For larger databases ( Many GB to a few TB )
# max_fsm_pages = ( ( DBsize / 8kB ) / 16 )
# max_fsm_pages = 800000  #100GB OLTP database
# max_fsm_pages = 4000000  #DW loading 0.5TB data daily

# synchronous_commit
# -------------------------
# This boolean setting controls whether or not all of your transactions
# are gauranteed to be written to disk when they commit.  If you are
# willing to lose up to 0.4 seconds of data in the event of an unexpected 
# shutdown (as many web applications are), then you can gain substantial
# performance benefits by turning off synchronous commit.  For most
# applications, however, this setting is better used on a per-session 
# basis.

  synchronous_commit = on   #most applications
# synchronous_commit = off  #if speed is more important than data

# wal_buffers
# -------------------------
# this memory setting defines how much buffer space is available for 
# the Write Ahead Log.  Set too low, it can become a bottleneck on 
# inserts and updates; there is no benefit to setting it high, however.
# As with some of the other settings above, may require increasing
# some kernel parameters.

wal_buffers = 8MB
 
# checkpoint_segments
# -------------------------
# This integer defines the maximum number of 8MB transaction log segments
# PostgreSQL will create before forcing a checkpoint.  For most
# high-volume OTLP databases and DW you will want to increase this
# setting significantly.  Alternately, just wait for checkpoint 
# warnings in the log before increasing this.
#
# Increasing this setting can make recovery in the event of unexpected 
# shutdown take longer.
#
# Maximum disk space required is (checkpoint_segments * 2 + 1) * 16MB, 
# so make sure you have that much available before setting it.

checkpoint_segments = 16    #normal small-medium database
# checkpoint_segments = 64  #high-volume OLTP database
# checkpoint_segments = 128 #heavy-ETL large database

# autovacuum
# ---------------------------
# autovacuum turns on a maintenance daemon which runs in the background, 
# periodically cleaning up your tables and indexes.  The only reason to turn
# autovacuum off is for large batch loads (ETL).

  autovacuum = on   #most databases
# autovacuum = off  #large DW

# effective_cache_size
# --------------------------
# This memory setting tells the PostgreSQL query planner how much RAM
# is estimated to be available for caching data, in both shared_buffers and
# in the filesystem cache. This setting just helps the planner make good
# cost estimates; it does not actually allocate the memory.

# The setting below is a formula.  Calculate the resulting value, then                                                                                                                   
# uncomment it.

# effective_cache_size = ( AvRAM * 0.75 )

# default_statistics_target
# --------------------------
# This integer setting determines the histogram sample size for the 
# data about table contents kept by the query planner.  The default
# is fine for most databases, but often users need to increase it 
# either because they're running data warehouses or because they have
# a lot of poorly planned queries.

default_statistics_target = 10
# default_statistics_target = 200  #have had some bad plans
# default_statistics_target = 400  #data warehouse

# constraint_exclusion
# --------------------------
# This boolean setting should be turned "on" if you plan to use table 
# partitioning.  Otherwise, it should be "off".

  constraint_exclusion = off #in general
# constraint_exclusion = on  #if you plan to use partitioning

# log_destination & logging settings
# --------------------------
# This ENUM value determines where PostgreSQL's logs are sent.  What
# setting to use really depends on your server room setup and the 
# production status and OS of your server.
#
# Note that there are several dozen settings on what and how often
# to log; these will not be covered in detail in this quick 
# configuration file.  Instead, several common combinations are
# given.

# Syslog setup for centralized monitoring
# log_destination = 'syslog'
# syslog_facility = 'LOCAL0'  #local syslog
# syslog_facility = 'log_server_name'  #remote syslog

# Windows
# log_destination = 'eventlog'

# Private PostgreSQL Log
# log_destination = 'stderr'
# log_collector = on
# log_directory = '/path/to/log/dir'

# CSV logging for collecting performance statistics.
# Warning: this much logging will generate many log
# files and affect performance.
# log_destination = 'csvlog'
# log_collector = on
# log_directory = '/path/to/log/dir'
# log_duration = on
# log_temp_files = 256kB
# log_statement = 'all'
เดี๋ยวค่อยมาอธิบายครับ รอหน่อย

No comments: