-- pagecounts data comes from http://dumps.wikimedia.org/other/pagecounts-raw/
-- documented at http://www.mediawiki.org/wiki/Analytics/Wikistats

-- define an external table over raw pagecounts data
CREATE TABLE IF NOT EXISTS pagecounts (projectcode STRING, pagename STRING,
  pageviews STRING, bytes STRING)
ROW FORMAT
  DELIMITED FIELDS TERMINATED BY ' '
  LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/tmp/wikistats';

-- create a view, building a custom hbase rowkey
CREATE VIEW IF NOT EXISTS pgc (rowkey, pageviews, bytes) AS
SELECT concat_ws('/',
         projectcode,
         concat_ws('/',
           pagename,
           regexp_extract(INPUT__FILE__NAME, 'pagecounts-(\\d{8}-\\d{6})\\..*$', 1))),
       pageviews, bytes
FROM pagecounts;

-- create a table to hold the input split partitions
CREATE EXTERNAL TABLE IF NOT EXISTS hbase_splits(partition STRING)
ROW FORMAT
  SERDE 'org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe'
STORED AS
  INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveNullValueSequenceFileOutputFormat'
LOCATION '/tmp/hbase_splits_out';

-- create a location to store the resulting HFiles
CREATE TABLE hbase_hfiles(rowkey STRING, pageviews STRING, bytes STRING)
STORED AS
  INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.hbase.HiveHFileOutputFormat'
TBLPROPERTIES('hfile.family.path' = '/tmp/hbase_hfiles/w');
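The pgc view builds each rowkey as projectcode/pagename/timestamp, pulling the timestamp out of the input file name through Hive's virtual column INPUT__FILE__NAME. If you want to sanity-check the regular expression before creating the view, something like this works (a minimal sketch; the literal path is only an illustration):

hive -e 'SELECT regexp_extract(
  "hdfs://namenode/tmp/wikistats/pagecounts-20081001-000000.gz",
  "pagecounts-(\\d{8}-\\d{6})\\..*$", 1);'
# expected output: 20081001-000000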
b. Create the HFile splits file. Example: sample.hql

-- prepare range partitioning of hfiles
ADD JAR /usr/lib/hive/lib/hive-contrib-0.11.0.1.3.0.0-104.jar;
SET mapred.reduce.tasks=1;
CREATE TEMPORARY FUNCTION row_seq AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';

-- input file contains ~4mm records. Sample it so as to produce 5 input splits.
INSERT OVERWRITE TABLE hbase_splits
SELECT rowkey FROM
  (SELECT rowkey, row_seq() AS seq FROM pgc
   TABLESAMPLE(BUCKET 1 OUT OF 10000 ON rowkey) s
   ORDER BY rowkey
   LIMIT 400) x
WHERE (seq % 100) = 0
ORDER BY rowkey
LIMIT 4;

-- after this is finished, combine the splits file:
dfs -cp /tmp/hbase_splits_out/* /tmp/hbase_splits;
c. Create hfiles.hql

ADD JAR /usr/lib/hbase/hbase-0.94.6.1.3.0.0-104-security.jar;
ADD JAR /usr/lib/hive/lib/hive-hbase-handler-0.11.0.1.3.0.0-104.jar;
SET mapred.reduce.tasks=5;
SET hive.mapred.partitioner=org.apache.hadoop.mapred.lib.TotalOrderPartitioner;
SET total.order.partitioner.path=/tmp/hbase_splits;

-- generate hfiles using the splits ranges
INSERT OVERWRITE TABLE hbase_hfiles
SELECT * FROM pgc
CLUSTER BY rowkey;
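Note how sample.hql and hfiles.hql fit together: TotalOrderPartitioner needs N-1 boundary keys to split the keyspace among N reducers, so the LIMIT 4 in sample.hql matches mapred.reduce.tasks=5 here, and CLUSTER BY rowkey sends each row to exactly one reducer in sorted order, so each reducer writes one HFile covering a disjoint key range. Before running hfiles.hql in step 6, it is worth confirming that the combined splits file sits where total.order.partitioner.path expects it (a minimal sketch, using the paths from the scripts above):

# TotalOrderPartitioner reads its 4 boundary keys from this path;
# 4 keys divide the keyspace into 5 ranges, one per reducer
hadoop fs -ls /tmp/hbase_splits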
mkdir /$Path_to_Input_Files_on_Hive_Client/wikistats
wget http://dumps.wikimedia.org/other/pagecounts-raw/2008/2008-10/pagecounts-20081001-000000.gz
hadoop fs -mkdir /$Path_to_Input_Files_on_Hive_Client/wikistats
hadoop fs -put pagecounts-20081001-000000.gz /$Path_to_Input_Files_on_Hive_Client/wikistats/
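Each line of the raw dump is space-delimited: projectcode, pagename, pageviews, bytes, which is exactly what the FIELDS TERMINATED BY ' ' clause in tables.ddl expects. A quick local peek at the download (a sketch; the rows shown are taken from the sample query output later in this walkthrough):

zcat pagecounts-20081001-000000.gz | head -3
# aa Main_Page 4 41431
# aa Special:ListUsers 1 5555
# aa Special:Listusers 1 1052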
$HCATALOG_USER -f /$Path_to_Input_Files_on_Hive_Client/tables.ddl

After running this, you should see output like the following:
OK
Time taken: 1.886 seconds
OK
Time taken: 0.654 seconds
OK
Time taken: 0.047 seconds
OK
Time taken: 0.115 seconds
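The four OK lines correspond to the four DDL statements in tables.ddl. To double-check what was created (a minimal sketch; in Hive 0.11 the view pgc is listed alongside the tables):

hive -e 'SHOW TABLES;'
# expected to include: hbase_hfiles, hbase_splits, pagecounts, pgc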
$HIVE_USER-e "select * from pagecounts limit 10;"执行之后,我们会看到如下的提示:
...
OK
aa Main_Page 4 41431
aa Special:ListUsers 1 5555
aa Special:Listusers 1 1052

Then run:
$HIVE_USER-e "select * from pgc limit 10;"执行之后,我们会看到如下的提示:
...
OK
aa/Main_Page/20081001-000000 4 41431
aa/Special:ListUsers/20081001-000000 1 5555
aa/Special:Listusers/20081001-000000 1 1052
...

5. Generate the HFile splits file
$HIVE_USER -f /$Path_to_Input_Files_on_Hive_Client/sample.hql
hadoop fs -ls /$Path_to_Input_Files_on_Hive_Client/hbase_splits

To verify the contents, run the following command:
hadoop jar /usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-104.jar \
  -libjars /usr/lib/hive/lib/hive-exec-0.11.0.1.3.0.0-104.jar \
  -input /tmp/hbase_splits \
  -output /tmp/hbase_splits_txt \
  -inputformat SequenceFileAsTextInputFormat

After it finishes, you should see output like the following:
...
INFO streaming.StreamJob: Output: /tmp/hbase_splits_txt
hadoop fs -cat /tmp/hbase_splits_txt/*

After running this, you should see results similar to:
61 66 2e 71 2f 4d 61 69 6e 5f 50 61 67 65 2f 32 30 30 38 31 30 30 31 2d 30 30 30 30 30 30 00  (null)
61 66 2f 31 35 35 30 2f 32 30 30 38 31 30 30 31 2d 30 30 30 30 30 30 00  (null)
61 66 2f 32 38 5f 4d 61 61 72 74 2f 32 30 30 38 31 30 30 31 2d 30 30 30 30 30 30 00  (null)
61 66 2f 42 65 65 6c 64 3a 31 30 30 5f 31 38 33 30 2e 4a 50 47 2f 32 30 30 38 31 30 30 31 2d 30 30 30 30 30 30 00  (null)
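Each record is a rowkey encoded by the BinarySortableSerDe and terminated by a null byte; the value side is empty, courtesy of HiveNullValueSequenceFileOutputFormat. You can decode a record back to text with xxd (a minimal sketch, using the second record above):

echo '61 66 2f 31 35 35 30 2f 32 30 30 38 31 30 30 31 2d 30 30 30 30 30 30 00' | xxd -r -p
# af/1550/20081001-000000  (plus a trailing NUL byte)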
6. Generate the HFiles

HADOOP_CLASSPATH=/usr/lib/hbase/hbase-0.94.6.1.3.0.0-104-security.jar hive -f /$Path_to_Input_Files_on_Hive_Client/hfiles.hql

The steps above are the approach recommended in the HDP user manual; I also tracked down the command format for the final step, loading the HFiles into HBase, online:
hadoop jar hbase-VERSION.jar completebulkload /user/todd/myoutput mytable
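Adapted to this walkthrough, it would look something like the following (a hedged sketch: the table name pagecounts is my own choice, and the target table must already exist with the column family named in hfile.family.path, here 'w'):

# assumption: create the target table with column family 'w' first;
# the table name 'pagecounts' is illustrative
echo "create 'pagecounts', 'w'" | hbase shell

# completebulkload moves the generated HFiles into the table's region directories
hadoop jar /usr/lib/hbase/hbase-0.94.6.1.3.0.0-104-security.jar \
  completebulkload /tmp/hbase_hfiles pagecounts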
Reposted from: http://jzyjl.baihongyu.com/