环境下载地址
下载hadoop、hive、sqoop
上传文件及解压文件
修改hadoop的配置文件
- hadoop-env.sh、yarn-env.sh、mapred-env.sh 配置环境变量
export JAVA_HOME=/opt/modules/jdk1.7.0_67
- slaves
- core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://hostname:8020</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/cdh5.3.6/hadoop-2.5.0-cdh5.3.6/data/tmp</value>
</property>
- hdfs-site.xml
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hostname:50090</value>
</property>
<property>
<name>dfs.namenode.http-address</name>
<value>hostname:50070</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
- yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hostname</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>4</value>
</property>
- mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>hostname:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hostname:19888</value>
</property>
格式化HDFS文件系统
命令:bin/hdfs namenode -format
启动服务
- hdfs服务
sbin/start-dfs.sh
- yarn服务
sbin/start-yarn.sh
- jobhistory服务
sbin/mr-jobhistory-daemon.sh start historyserver
配置hive
- 修改hive-env.sh
# Set HADOOP_HOME to point to a specific hadoop install directory
HADOOP_HOME=/opt/cdh5.3.6/hadoop-2.5.0-cdh5.3.6
# Hive Configuration Directory can be controlled by:
export HIVE_CONF_DIR=/opt/cdh5.3.6/hive-0.13.1-cdh5.3.6/conf
- 修改hive-log4j.properties(先将hive-log4j.properties.template重命名去掉.template后缀,否则不生效)
# Define some default values that can be overridden by system properties
hive.log.threshold=ALL
hive.root.logger=WARN,DRFA
hive.log.dir=/opt/cdh5.3.6/hive-0.13.1-cdh5.3.6/logs
hive.log.file=hive.log
- 创建hivesite文件并初始化
vi hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/matadata?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>hive.cli.print.header</name>
<value>true</value>
<description>Whether to print the names of the columns in query output.</description>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
<description>Whether to include the current database in the Hive prompt.</description>
</property>
<property>
<name>hive.fetch.task.conversion</name>
<value>more</value>
<description>
Some select queries can be converted to single FETCH task minimizing latency.
Currently the query should be single sourced not having any subquery and should not have
any aggregations or distincts (which incurs RS), lateral views and joins.
1. minimal : SELECT STAR, FILTER on partition columns, LIMIT only
2. more : SELECT, FILTER, LIMIT only (TABLESAMPLE, virtual columns)
</description>
</property>
</configuration>
- 创建hive连接的mysql数据库
create database matadata;
-
测试hive并解决异常
解决办法:上传mysql连接的jar包到hive的lib目录
cp /opt/modules/hive-0.13.1/lib/mysql-connector-java-5.1.27-bin.jar /opt/cdh5.3.6/hive-0.13.1-cdh5.3.6/lib/
- 创建元数据存储目录
bin/hdfs dfs -mkdir -p /user/hive/warehouse
- 使同组的所有用户都对/user/hive/warehouse目录具有写权限
bin/hdfs dfs -chmod g+w /user/hive/warehouse
测试hive、hdfs、MapReduce
- 创建一个表 (hive)
create table student(id int, name string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
- 导入数据(hdfs)
load data local inpath '/opt/datas/student.txt' into table student;
- 查询数据的个数(MapReduce)
select count(1) from student;
查看文件系统web
查看yarn的web
sqoop配置
- 修改配置文件
- 复制mysql连接jar包
cp /opt/sofewares/mysql-libs/mysql-connector-java-5.1.27/mysql-connector-java-5.1.27-bin.jar /opt/cdh5.3.6/sqoop-1.4.5-cdh5.3.6/lib/
-
查看mysql数据中有多少数据库
bin/sqoop list-databases \
--connect jdbc:mysql://localhost:3306 \
--username root \
--password 123456